In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from mypipes_nn import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
file=r'/Users/lalitsachan/Dropbox/0.0 Data/census_income.csv'

ci_train=pd.read_csv(file)

In [3]:
ci_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
ci_train['Y'].value_counts()

 <=50K    24720
 >50K      7841
Name: Y, dtype: int64

In [5]:
# pd.crosstab(ci_train['education'],ci_train['education.num'])

In [6]:
cat_vars=list(ci_train.select_dtypes(include=['object']).columns)

In [7]:
cat_vars=[_ for _ in cat_vars if _ not in ['Y','education']]

In [8]:
num_vars=list(ci_train.select_dtypes(exclude=['object']).columns)

In [9]:
p1=pdPipeline([
    ('cat_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(300))
])

p2=pdPipeline([
    ('num_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer()),
    ('standardise',pdStdScaler())
])

data_pipe=FeatureUnion([
    ('cat_pipe',p1),
    ('num_pipe',p2)
])

In [10]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(ci_train),
                     columns=data_pipe.get_feature_names())

In [11]:
ci_train['Y'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [12]:
y_train=(ci_train['Y']==' >50K').astype(int)

In [13]:
x_train.shape

(32561, 45)

In [20]:
MLPClassifier?

In [15]:
parameters={
    
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(25,10,5),(20,10),(10,20)],

'alpha': [0.3,.1,.01],
'activation': ["relu", "logistic", "tanh"]

}

In [16]:
clf=MLPClassifier()

In [18]:
random_search=RandomizedSearchCV(clf,
                                 n_iter=5,
                                 cv=10,
                                 param_distributions=parameters,
                                 scoring='roc_auc',
                                 random_state=2,
                                 n_jobs=-1,verbose=20)

In [19]:
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  

RandomizedSearchCV(cv=10, estimator=MLPClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'activation': ['relu', 'logistic',
                                                       'tanh'],
                                        'alpha': [0.3, 0.1, 0.01],
                                        'hidden_layer_sizes': [(25, 10, 5),
                                                               (20, 10),
                                                               (10, 20)],
                                        'learning_rate': ['constant',
                                                          'invscaling',
                                                          'adaptive']},
                   random_state=2, scoring='roc_auc', verbose=20)

In [None]:
random_search.best_estimator_

In [21]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [22]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.915 (std: 0.00390)
Parameters: {'learning_rate': 'constant', 'hidden_layer_sizes': (10, 20), 'alpha': 0.01, 'activation': 'tanh'}

Model with rank: 2
Mean validation score: 0.912 (std: 0.00462)
Parameters: {'learning_rate': 'invscaling', 'hidden_layer_sizes': (10, 20), 'alpha': 0.01, 'activation': 'relu'}

Model with rank: 3
Mean validation score: 0.910 (std: 0.00415)
Parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (25, 10, 5), 'alpha': 0.01, 'activation': 'tanh'}

Model with rank: 4
Mean validation score: 0.906 (std: 0.00562)
Parameters: {'learning_rate': 'constant', 'hidden_layer_sizes': (20, 10), 'alpha': 0.3, 'activation': 'logistic'}

Model with rank: 5
Mean validation score: 0.906 (std: 0.00556)
Parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (25, 10, 5), 'alpha': 0.3, 'activation': 'logistic'}



In [23]:
mlp=random_search.best_estimator_

In [24]:
mlp.fit(x_train,y_train)

MLPClassifier(activation='tanh', alpha=0.01, hidden_layer_sizes=(10, 20))

In [25]:
mlp.intercepts_

[array([ 0.26992119, -0.28211629, -0.48877695, -0.38443483, -0.38211888,
         0.20242105, -0.35973156, -0.62138772, -0.09107568,  0.41545382]),
 array([-0.04667213,  0.26299529,  0.18826465,  0.56665959,  0.25689149,
        -0.26050866, -0.16225395,  0.38046638,  0.46714135, -0.2699965 ,
         0.27967009, -0.39382643, -0.39754854,  0.29977025, -0.18165086,
         0.41195177,  0.33888299,  0.02493884, -0.1478562 , -0.27776636]),
 array([-0.37289285])]

In [32]:
mlp.coefs_[2].shape

(20, 1)

In [33]:
mlp.predict_proba(x_train)

array([[9.75570826e-01, 2.44291742e-02],
       [8.50157955e-01, 1.49842045e-01],
       [9.84050836e-01, 1.59491644e-02],
       ...,
       [9.71552561e-01, 2.84474386e-02],
       [9.99625100e-01, 3.74899922e-04],
       [2.53142859e-03, 9.97468571e-01]])

In [34]:
mlp.classes_

array([0, 1])

In [35]:
mlp.predict_proba(x_train)[:,1]

array([2.44291742e-02, 1.49842045e-01, 1.59491644e-02, ...,
       2.84474386e-02, 3.74899922e-04, 9.97468571e-01])