In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

We're ready to build our first neural network. We will have multiple features we feed into our model, each of which will go through a set of perceptron models to arrive at a response which will be trained to our output.

Like many models we've covered, this can be used as either a regression or a classification model.

First, we need to load our dataset. For this example we'll use The Museum of Modern Art in New York's [public dataset](https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv) on their collection.

In [2]:
# Read MoMA's public collection CSV directly from GitHub into a DataFrame.
ARTWORKS_CSV_URL = 'https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv'
artworks = pd.read_csv(ARTWORKS_CSV_URL)

In [3]:
# List every column available in the raw dataset before selecting features.
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

We'll also do a bit of data processing and cleaning, selecting columns of interest and converting URLs to booleans indicating whether they are present.

In [4]:
# Select the columns of interest.
# .copy() makes the selection an independent frame, so the column assignments
# below modify it directly instead of triggering pandas' SettingWithCopyWarning.
columns_of_interest = ['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                       'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']
artworks = artworks[columns_of_interest].copy()

# Convert URLs to booleans: True when a URL is present, False when it is missing.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows in a single pass.
# NOTE(review): the recorded class balance further down still lists
# 'Media and Performance', so the 'Media and Performance Art' label may not
# match the current dataset -- confirm which spelling is intended.
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Drop every row that is missing a value in any remaining column.
artworks = artworks.dropna()

In [5]:
# Preview the first few rows of the cleaned frame.
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


## Building a Model

Now, let's see if we can use multi-layer perceptron modeling (or "MLP") to classify the department a piece should go into using everything but the department name.

Before we import MLP from SKLearn and establish the model we first have to ensure correct typing for our data and do some other cleaning.

In [6]:
# Inspect each column's dtype to see which features still need converting before modelling.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

The `DateAcquired` column is an object. Let's transform that to a datetime object and add a feature for just the year the artwork was acquired.

In [7]:
# Parse the acquisition date strings into datetimes.
artworks['DateAcquired'] = pd.to_datetime(artworks['DateAcquired'])

# Keep just the acquisition year as its own numeric feature.
artworks['YearAcquired'] = artworks['DateAcquired'].dt.year

# Confirm the new column's dtype.
artworks['YearAcquired'].dtype

dtype('int64')

Great. Let's do some more miscellaneous cleaning.

In [8]:
# Collapse rows that list several people into a single category.
# Each person's value is wrapped in parentheses (e.g. "(Male)"), so the sequence
# ") (" only occurs when a row names more than one artist. The pattern is a raw
# string so the regex escapes are not parsed as (invalid) Python string escapes.
MULTIPLE_ENTRIES = r'\) \('
# The replacement labels keep their exact previous values (literal backslashes
# included) so the resulting dummy column names do not change.
artworks.loc[artworks['Gender'].str.contains(MULTIPLE_ENTRIES), 'Gender'] = r'\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains(MULTIPLE_ENTRIES), 'Nationality'] = r'\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(MULTIPLE_ENTRIES), 'Artist'] = 'Multiple_Artists'

# Reduce free-form dates to their first four-digit year, cutting down the number
# of distinct values. The full extracted series is assigned: truncating it would
# leave the last row's Date as NaN because assignment aligns on the index.
artworks['Date'] = artworks['Date'].str.extract(r'([0-9]{4})', expand=False)

# Drop the target, the raw acquisition date (YearAcquired stands in for it), and
# the high-cardinality columns that are one-hot encoded separately below.
# `columns=` is used because current pandas no longer accepts a positional axis.
X = artworks.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

# Create dummies separately.
artists = pd.get_dummies(artworks['Artist'])
nationalities = pd.get_dummies(artworks['Nationality'])
dates = pd.get_dummies(artworks['Date'])

# One-hot encode what is left in X, then append the nationality and date dummies.
# The artist dummies slow this wayyyyy down, so they are kept out for now.
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

# Target: the department each artwork belongs to.
Y = artworks['Department']

In [9]:
# With the features prepared, we can train the network.
# Fitting a neural network is computationally expensive;
# expect this cell to run for several minutes.

# Bring in scikit-learn's multi-layer perceptron classifier.
from sklearn.neural_network import MLPClassifier

# One hidden layer of 1000 units; verbose=True prints the loss after every iteration.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

Iteration 1, loss = 4.60889626
Iteration 2, loss = 2.96428507
Iteration 3, loss = 2.21676705
Iteration 4, loss = 2.13812748
Iteration 5, loss = 1.91064622
Iteration 6, loss = 1.82439544
Iteration 7, loss = 1.55706938
Iteration 8, loss = 1.60855446
Iteration 9, loss = 1.60626746
Iteration 10, loss = 1.24729755
Iteration 11, loss = 1.37269107
Iteration 12, loss = 1.22761722
Iteration 13, loss = 1.13797268
Iteration 14, loss = 1.16756439
Iteration 15, loss = 1.02990274
Iteration 16, loss = 0.89113534
Iteration 17, loss = 0.90703326
Iteration 18, loss = 0.96418154
Iteration 19, loss = 0.84229326
Iteration 20, loss = 0.85485855
Iteration 21, loss = 0.86752232
Iteration 22, loss = 0.75571864
Iteration 23, loss = 0.76623655
Iteration 24, loss = 0.78621612
Iteration 25, loss = 0.72794386
Iteration 26, loss = 0.70035965
Iteration 27, loss = 0.70989074
Iteration 28, loss = 0.68344959
Iteration 29, loss = 0.68172422
Iteration 30, loss = 0.68838371
Iteration 31, loss = 0.67768426
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [10]:
# Mean accuracy on the same data the model was fit on (not a held-out estimate).
mlp.score(X, Y)

0.7991084640426162

In [11]:
# Share of artworks in each department, i.e. the class balance of the target.
Y.value_counts().div(len(Y))

Drawings & Prints        0.623002
Photography              0.226630
Architecture & Design    0.113171
Painting & Sculpture     0.033590
Media and Performance    0.003607
Name: Department, dtype: float64

In [12]:
from sklearn.model_selection import cross_val_score

# Refit the network on five train/validation splits and report the score on each held-out fold.
cross_val_score(estimator=mlp, X=X, y=Y, cv=5)

Iteration 1, loss = 4.74084135
Iteration 2, loss = 2.62895523
Iteration 3, loss = 2.16289850
Iteration 4, loss = 2.11132903
Iteration 5, loss = 2.11377067
Iteration 6, loss = 2.18341611
Iteration 7, loss = 1.62640417
Iteration 8, loss = 1.82345288
Iteration 9, loss = 1.48023386
Iteration 10, loss = 1.40728045
Iteration 11, loss = 1.71189886
Iteration 12, loss = 1.52286531
Iteration 13, loss = 1.49643374
Iteration 14, loss = 1.18710421
Iteration 15, loss = 1.15618300
Iteration 16, loss = 1.04456330
Iteration 17, loss = 1.09965041
Iteration 18, loss = 1.02794767
Iteration 19, loss = 0.92497269
Iteration 20, loss = 1.03141145
Iteration 21, loss = 0.86539113
Iteration 22, loss = 0.88219371
Iteration 23, loss = 0.80553192
Iteration 24, loss = 0.86955846
Iteration 25, loss = 0.87816168
Iteration 26, loss = 0.73696630
Iteration 27, loss = 0.75655997
Iteration 28, loss = 0.81056532
Iteration 29, loss = 0.70027449
Iteration 30, loss = 0.74350654
Iteration 31, loss = 0.74356107
Iteration 32, los



Iteration 1, loss = 4.63793770
Iteration 2, loss = 2.64440439
Iteration 3, loss = 2.66928947
Iteration 4, loss = 1.73016404
Iteration 5, loss = 1.99765747
Iteration 6, loss = 1.88915073
Iteration 7, loss = 1.96718907
Iteration 8, loss = 1.62855581
Iteration 9, loss = 1.79731110
Iteration 10, loss = 1.76365711
Iteration 11, loss = 1.14517729
Iteration 12, loss = 1.39131406
Iteration 13, loss = 1.39114385
Iteration 14, loss = 1.06606297
Iteration 15, loss = 1.28409453
Iteration 16, loss = 1.02124096
Iteration 17, loss = 1.04543282
Iteration 18, loss = 1.03337407
Iteration 19, loss = 0.91729102
Iteration 20, loss = 1.00020056
Iteration 21, loss = 0.85314179
Iteration 22, loss = 0.86828027
Iteration 23, loss = 0.86193965
Iteration 24, loss = 0.85470508
Iteration 25, loss = 0.83153515
Iteration 26, loss = 0.77921100
Iteration 27, loss = 0.76103793
Iteration 28, loss = 0.78597715
Iteration 29, loss = 0.74156834
Iteration 30, loss = 0.74195632
Iteration 31, loss = 0.72427307
Iteration 32, los

Iteration 68, loss = 0.57567986
Iteration 69, loss = 0.57862635
Iteration 70, loss = 0.57316058
Iteration 71, loss = 0.57565459
Iteration 72, loss = 0.57653915
Iteration 73, loss = 0.57235530
Iteration 74, loss = 0.56525578
Iteration 75, loss = 0.57021817
Iteration 76, loss = 0.56409256
Iteration 77, loss = 0.56311179
Iteration 78, loss = 0.56020530
Iteration 79, loss = 0.56454853
Iteration 80, loss = 0.56339263
Iteration 81, loss = 0.55952116
Iteration 82, loss = 0.55480782
Iteration 83, loss = 0.55846960
Iteration 84, loss = 0.55860739
Iteration 85, loss = 0.55977626
Iteration 86, loss = 0.55401099
Iteration 87, loss = 0.55005081
Iteration 88, loss = 0.54972319
Iteration 89, loss = 0.55443862
Iteration 90, loss = 0.55447853
Iteration 91, loss = 0.54709419
Iteration 92, loss = 0.54965786
Iteration 93, loss = 0.54842604
Iteration 94, loss = 0.54682394
Iteration 95, loss = 0.54738479
Iteration 96, loss = 0.54651831
Iteration 97, loss = 0.54059690
Iteration 98, loss = 0.54865896
Iteratio

Iteration 172, loss = 0.49316306
Iteration 173, loss = 0.49331968
Iteration 174, loss = 0.49336420
Iteration 175, loss = 0.49557104
Iteration 176, loss = 0.49798995
Iteration 177, loss = 0.49340742
Iteration 178, loss = 0.49627896
Iteration 179, loss = 0.49626733
Iteration 180, loss = 0.49233660
Iteration 181, loss = 0.49620630
Iteration 182, loss = 0.49444888
Iteration 183, loss = 0.49280672
Iteration 184, loss = 0.49691214
Iteration 185, loss = 0.49460369
Iteration 186, loss = 0.49624872
Iteration 187, loss = 0.49178000
Iteration 188, loss = 0.49219987
Iteration 189, loss = 0.49414602
Iteration 190, loss = 0.49243320
Iteration 191, loss = 0.48508746
Iteration 192, loss = 0.49273088
Iteration 193, loss = 0.48983922
Iteration 194, loss = 0.48922634
Iteration 195, loss = 0.49143423
Iteration 196, loss = 0.49156329
Iteration 197, loss = 0.49053017
Iteration 198, loss = 0.48603006
Iteration 199, loss = 0.49102746
Iteration 200, loss = 0.49280986




Iteration 1, loss = 4.30961792
Iteration 2, loss = 2.27760381
Iteration 3, loss = 1.94210251
Iteration 4, loss = 2.06943210
Iteration 5, loss = 1.61786074
Iteration 6, loss = 1.85499719
Iteration 7, loss = 1.41043792
Iteration 8, loss = 1.57821844
Iteration 9, loss = 1.46928551
Iteration 10, loss = 1.39594778
Iteration 11, loss = 1.37151501
Iteration 12, loss = 1.14623327
Iteration 13, loss = 1.10904971
Iteration 14, loss = 1.30651737
Iteration 15, loss = 1.20303723
Iteration 16, loss = 1.00410917
Iteration 17, loss = 1.11484190
Iteration 18, loss = 0.91376671
Iteration 19, loss = 0.95455258
Iteration 20, loss = 0.82563927
Iteration 21, loss = 0.92409572
Iteration 22, loss = 0.86186440
Iteration 23, loss = 0.79801274
Iteration 24, loss = 0.79544578
Iteration 25, loss = 0.78399373
Iteration 26, loss = 0.77243432
Iteration 27, loss = 0.74949772
Iteration 28, loss = 0.69187358
Iteration 29, loss = 0.70903613
Iteration 30, loss = 0.73144361
Iteration 31, loss = 0.69814443
Iteration 32, los

array([0.70051325, 0.74226661, 0.64004624, 0.66825434, 0.58365705])

We got a lot of information from all of this. First, we can see that the model seems to overfit, though there is still some remaining performance when validated with cross validation. This is a feature of neural networks that aren't given enough data for the number of features present. _Neural networks, in general, like_ a lot _of data_. You may also have noticed something else about neural networks: _they can take a_ long _time to run_. Try increasing the layer size by adding a zero. Feel free to interrupt the kernel if you don't have time...

Also note that we created dummy variables for the artists' names but left them out. Both of the above points are the reason for that: including them would take much longer to run and would make the model much more prone to overfitting.

## Model parameters

Now, before we move on and let you loose with some tasks to work on the model, let's go over the parameters.

We included one parameter: hidden layer size. Remember the previous lesson, where we talked about layers in a neural network. This parameter tells the model how many layers to build and how big to make each one. Pass in a tuple that specifies each layer's size. Our network is one layer, 1000 neurons wide. `(100, 4)` would create a network with two layers, one 100 wide and the other 4.

How many layers to include is determined by two things: computational resources and cross validation searching for convergence. It's generally less than the number of input variables you have.

You can also set an alpha. Neural networks like this use a regularization parameter that penalizes large coefficients just like we discussed in the advanced regression section. Alpha scales that penalty.

Lastly, we'll discuss the activation function. The activation function determines whether the output from an individual perceptron is binary or continuous. By default this is 'relu', the 'rectified linear unit' function. In the exercise we went through earlier we used a binary function, but we discussed the _sigmoid_ as a reasonable alternative. The _sigmoid_ (called 'logistic' by SKLearn because it's a 'logistic sigmoid function') allows for continuous variables between 0 and 1, which allows for a more nuanced model. It does come at the cost of increased computational complexity.

If you want to learn more about these, study [activation functions](https://en.wikipedia.org/wiki/Activation_function) and [multilayer perceptrons](https://en.wikipedia.org/wiki/Multilayer_perceptron). The [Deep Learning](http://www.deeplearningbook.org/) book referenced earlier goes into great detail on the linear algebra involved.

You could also just test the models with cross validation. Unless neural networks are your specialty cross validation should be sufficient.

For the other parameters and their defaults, check out the [MLPClassifier documentation](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier).

## Drill: Playing with layers

Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well. It may also be beneficial to do some real feature selection work...

In [13]:
# First experiment: two hidden layers, each 20 units wide.
mlpA = MLPClassifier(verbose=True, hidden_layer_sizes=(20, 20))
mlpA.fit(X, Y)

Iteration 1, loss = 1.75050702
Iteration 2, loss = 0.86536691
Iteration 3, loss = 0.82856837
Iteration 4, loss = 0.80939455
Iteration 5, loss = 0.79227904
Iteration 6, loss = 0.77681882
Iteration 7, loss = 0.77490690
Iteration 8, loss = 0.75992367
Iteration 9, loss = 0.75849648
Iteration 10, loss = 0.74551978
Iteration 11, loss = 0.74447248
Iteration 12, loss = 0.74076257
Iteration 13, loss = 0.73383024
Iteration 14, loss = 0.73834118
Iteration 15, loss = 0.72820069
Iteration 16, loss = 0.73508147
Iteration 17, loss = 0.72405298
Iteration 18, loss = 0.72254730
Iteration 19, loss = 0.72076640
Iteration 20, loss = 0.72372958
Iteration 21, loss = 0.72018949
Iteration 22, loss = 0.71156967
Iteration 23, loss = 0.71412137
Iteration 24, loss = 0.70848255
Iteration 25, loss = 0.70539301
Iteration 26, loss = 0.70137026
Iteration 27, loss = 0.69581869
Iteration 28, loss = 0.68858064
Iteration 29, loss = 0.68676141
Iteration 30, loss = 0.68066390
Iteration 31, loss = 0.68051388
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [14]:
# Next: a deeper but narrower network with four hidden layers of 10 units each.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(10, 10, 10, 10))
mlp.fit(X, Y)

Iteration 1, loss = 1.21262974
Iteration 2, loss = 0.94211320
Iteration 3, loss = 0.90421235
Iteration 4, loss = 0.86342987
Iteration 5, loss = 0.83669095
Iteration 6, loss = 0.82041437
Iteration 7, loss = 0.80572846
Iteration 8, loss = 0.79100408
Iteration 9, loss = 0.77845008
Iteration 10, loss = 0.76909039
Iteration 11, loss = 0.75927630
Iteration 12, loss = 0.74963734
Iteration 13, loss = 0.74373009
Iteration 14, loss = 0.73849886
Iteration 15, loss = 0.72245094
Iteration 16, loss = 0.71146972
Iteration 17, loss = 0.70787006
Iteration 18, loss = 0.69968362
Iteration 19, loss = 0.69212449
Iteration 20, loss = 0.68615811
Iteration 21, loss = 0.68355370
Iteration 22, loss = 0.67901077
Iteration 23, loss = 0.67928500
Iteration 24, loss = 0.67600687
Iteration 25, loss = 0.67247521
Iteration 26, loss = 0.67529189
Iteration 27, loss = 0.66869348
Iteration 28, loss = 0.66703309
Iteration 29, loss = 0.66504500
Iteration 30, loss = 0.66541042
Iteration 31, loss = 0.66326855
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [15]:
# Mixed widths: three hidden layers of 30, 15 and 30 units.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(30, 15, 30))
mlp.fit(X, Y)

Iteration 1, loss = 1.62444573
Iteration 2, loss = 0.87938951
Iteration 3, loss = 0.83535881
Iteration 4, loss = 0.80122410
Iteration 5, loss = 0.78369105
Iteration 6, loss = 0.75499569
Iteration 7, loss = 0.74225950
Iteration 8, loss = 0.74130047
Iteration 9, loss = 0.72996599
Iteration 10, loss = 0.71566827
Iteration 11, loss = 0.71345415
Iteration 12, loss = 0.71261905
Iteration 13, loss = 0.70208299
Iteration 14, loss = 0.69642978
Iteration 15, loss = 0.69076622
Iteration 16, loss = 0.68659020
Iteration 17, loss = 0.68984147
Iteration 18, loss = 0.68225905
Iteration 19, loss = 0.67709109
Iteration 20, loss = 0.68079639
Iteration 21, loss = 0.67463487
Iteration 22, loss = 0.66666086
Iteration 23, loss = 0.66966146
Iteration 24, loss = 0.66686775
Iteration 25, loss = 0.66065472
Iteration 26, loss = 0.66111574
Iteration 27, loss = 0.65818823
Iteration 28, loss = 0.65348575
Iteration 29, loss = 0.65867088
Iteration 30, loss = 0.65468794
Iteration 31, loss = 0.64887495
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 15, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [16]:
# Scale up: two hidden layers of 100 units each.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(100, 100))
mlp.fit(X, Y)

Iteration 1, loss = 2.11069668
Iteration 2, loss = 1.22528509
Iteration 3, loss = 1.06292973
Iteration 4, loss = 1.03687771
Iteration 5, loss = 0.92261764
Iteration 6, loss = 0.85493033
Iteration 7, loss = 0.85132886
Iteration 8, loss = 0.81987830
Iteration 9, loss = 0.85333279
Iteration 10, loss = 0.77014821
Iteration 11, loss = 0.78088211
Iteration 12, loss = 0.75000927
Iteration 13, loss = 0.72458290
Iteration 14, loss = 0.72899398
Iteration 15, loss = 0.71945490
Iteration 16, loss = 0.70697148
Iteration 17, loss = 0.70404150
Iteration 18, loss = 0.69284284
Iteration 19, loss = 0.68378906
Iteration 20, loss = 0.68030933
Iteration 21, loss = 0.68368222
Iteration 22, loss = 0.67240077
Iteration 23, loss = 0.67192473
Iteration 24, loss = 0.65729368
Iteration 25, loss = 0.65221005
Iteration 26, loss = 0.65694181
Iteration 27, loss = 0.65761875
Iteration 28, loss = 0.65296657
Iteration 29, loss = 0.64027271
Iteration 30, loss = 0.63193281
Iteration 31, loss = 0.64023142
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

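It seems that no matter what type of hidden layer structure I use, I can't get the model to converge. One thing worth trying next is standardizing the numeric inputs (height, width, year acquired), since scikit-learn's documentation notes that MLPs are sensitive to feature scaling.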

In [17]:
# Scale up again: two far wider hidden layers of 300 and 700 units.
mlp = MLPClassifier(verbose=True, hidden_layer_sizes=(300, 700))
mlp.fit(X, Y)

Iteration 1, loss = 4.98013589
Iteration 2, loss = 1.48987650
Iteration 3, loss = 1.14307520
Iteration 4, loss = 0.84651079
Iteration 5, loss = 0.81422099
Iteration 6, loss = 0.78987977
Iteration 7, loss = 0.76437225
Iteration 8, loss = 0.71190243
Iteration 9, loss = 0.71718418
Iteration 10, loss = 0.69790218
Iteration 11, loss = 0.68505362
Iteration 12, loss = 0.66893865
Iteration 13, loss = 0.65962135
Iteration 14, loss = 0.65823545
Iteration 15, loss = 0.64706502
Iteration 16, loss = 0.63710117
Iteration 17, loss = 0.63085156
Iteration 18, loss = 0.62411772
Iteration 19, loss = 0.62097042
Iteration 20, loss = 0.61510261
Iteration 21, loss = 0.61004363
Iteration 22, loss = 0.61146759
Iteration 23, loss = 0.59864040
Iteration 24, loss = 0.61025003
Iteration 25, loss = 0.59669431
Iteration 26, loss = 0.59409336
Iteration 27, loss = 0.58648165
Iteration 28, loss = 0.57985113
Iteration 29, loss = 0.57465075
Iteration 30, loss = 0.57110995
Iteration 31, loss = 0.56433498
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 700), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)