UCI Adult Dataset MLP SKLearn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def normalize_data(X_train, X_test, scaler=None):
    """Scale the train and test features with a scaler fitted on the training split.

    Parameters
    ----------
    X_train : array-like
        Training features; the scaler is fitted on this split.
    X_test : array-like
        Held-out features; only transformed, using what was fitted on
        ``X_train``, so the test split never influences the fit.
    scaler : object, optional
        Any object exposing ``fit_transform`` and ``transform`` (for example
        ``StandardScaler()``). When omitted, a fresh ``MinMaxScaler()`` is
        used, which is the original behavior.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    if scaler is None:
        scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled


def evaluate_model(y_test, y_pred):
    """Print the classification report, confusion matrix and accuracy, in that order.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels to compare against ``y_test``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    for metric in (classification_report, confusion_matrix, accuracy_score):
        print(metric(y_test, y_pred))

In [None]:
# Load Data
# The raw file has no header row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

# Fields are separated by ", ", so string values keep a leading space
# (e.g. ' <=50K'). Unknown entries are recorded as ' ?'; parse them as NaN so
# that isnull()/info() report the missing values instead of showing none.
df = pd.read_csv(url, names=column_names, na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
# Number of NaN entries per column.
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [None]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
# Encode the target as integers: 0 for ' <=50K', 1 for ' >50K' (the raw labels
# keep the leading space from the CSV). Series.map is used instead of
# Series.replace, which emits a FutureWarning about silent downcasting here.
# A label outside the mapping becomes NaN and makes astype raise, so a label
# mismatch fails loudly instead of leaving 'income' as a non-numeric column.
# The dtype guard keeps the cell safe to re-run once the column is encoded.
income_codes = {' <=50K': 0, ' >50K': 1}
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = df['income'].map(income_codes).astype('int64')
df.head()

  df['income'] = df['income'].replace([' <=50K', ' >50K'], [0, 1])


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [None]:
# Summary statistics again; 'income' is now included because it has been encoded as 0/1.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [None]:
# Restrict the frame to its numeric columns; the string-valued columns are
# dropped here rather than encoded.
df_num = df.select_dtypes(include='number')
df_num

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
0,39,77516,13,2174,0,40,0
1,50,83311,13,0,0,13,0
2,38,215646,9,0,0,40,0
3,53,234721,7,0,0,40,0
4,28,338409,13,0,0,40,0
...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0
32557,40,154374,9,0,0,40,1
32558,58,151910,9,0,0,40,0
32559,22,201490,9,0,0,20,0


In [None]:
# Convert the numeric frame to a plain NumPy array, one row per record.
data = df_num.to_numpy()
data

array([[    39,  77516,     13, ...,      0,     40,      0],
       [    50,  83311,     13, ...,      0,     13,      0],
       [    38, 215646,      9, ...,      0,     40,      0],
       ...,
       [    58, 151910,      9, ...,      0,     40,      0],
       [    22, 201490,      9, ...,      0,     20,      0],
       [    52, 287927,      9, ...,      0,     40,      1]])

In [None]:
# Split column-wise: every column but the last is a feature, the last column
# ('income') is the target.
X, y = data[:, :-1], data[:, -1]
X, y

(array([[    39,  77516,     13,   2174,      0,     40],
        [    50,  83311,     13,      0,      0,     13],
        [    38, 215646,      9,      0,      0,     40],
        ...,
        [    58, 151910,      9,      0,      0,     40],
        [    22, 201490,      9,      0,      0,     20],
        [    52, 287927,      9,  15024,      0,     40]]),
 array([0, 0, 0, ..., 0, 0, 1]))

In [None]:
from sklearn.model_selection import train_test_split

# Split: hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((26048, 6), (6513, 6), (26048,), (6513,))

In [None]:
# Normalization: the scaler is fitted on the training split and then applied to
# both splits, so the test split does not influence the fit.
X_train, X_test = normalize_data(X_train, X_test)
# Note: the displayed tuple is ordered (test, train).
X_test, X_train

(array([[0.1369863 , 0.10044213, 0.6       , 0.        , 0.        ,
         0.37755102],
        [0.38356164, 0.02599938, 0.53333333, 0.        , 0.        ,
         0.39795918],
        [0.16438356, 0.11791676, 0.8       , 0.        , 0.        ,
         0.55102041],
        ...,
        [0.4109589 , 0.23217764, 0.93333333, 1.        , 0.        ,
         0.47959184],
        [0.1369863 , 0.02130302, 0.8       , 0.        , 0.        ,
         0.39795918],
        [0.01369863, 0.0092039 , 0.53333333, 0.        , 0.        ,
         0.24489796]]),
 array([[0.21917808, 0.12625338, 0.8       , 0.        , 0.        ,
         0.5       ],
        [0.26027397, 0.05037557, 0.66666667, 0.        , 0.43319559,
         0.5       ],
        [0.56164384, 0.12955135, 0.26666667, 0.        , 0.        ,
         0.39795918],
        ...,
        [0.01369863, 0.13854675, 0.4       , 0.        , 0.        ,
         0.19387755],
        [0.45205479, 0.02850817, 0.53333333, 0.        , 0.   

## MLP SKLearn

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Multi-layer perceptron with two hidden layers of 100 units each.
# max_iter caps the number of training iterations, random_state fixes the seed
# for reproducibility, and verbose=True prints the loss at every iteration.
clf = MLPClassifier(hidden_layer_sizes=(100, 100),
                    max_iter=1000,
                    random_state=42,
                    verbose=True)

In [None]:
# Show the full hyperparameter set, including defaults that were not set explicitly.
clf.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100, 100),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 1000,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': True,
 'warm_start': False}

In [None]:
# Train on the normalized training split; the per-iteration loss is printed because verbose=True.
clf.fit(X_train, y_train)

Iteration 1, loss = 0.50585335
Iteration 2, loss = 0.40802479
Iteration 3, loss = 0.39850708
Iteration 4, loss = 0.39335929
Iteration 5, loss = 0.38996501
Iteration 6, loss = 0.39051954
Iteration 7, loss = 0.38974381
Iteration 8, loss = 0.38852623
Iteration 9, loss = 0.38907682
Iteration 10, loss = 0.38818040
Iteration 11, loss = 0.38840989
Iteration 12, loss = 0.38815961
Iteration 13, loss = 0.38737578
Iteration 14, loss = 0.38691832
Iteration 15, loss = 0.38755209
Iteration 16, loss = 0.38620069
Iteration 17, loss = 0.38706139
Iteration 18, loss = 0.38619703
Iteration 19, loss = 0.38547936
Iteration 20, loss = 0.38586623
Iteration 21, loss = 0.38647494
Iteration 22, loss = 0.38474588
Iteration 23, loss = 0.38541237
Iteration 24, loss = 0.38507996
Iteration 25, loss = 0.38442387
Iteration 26, loss = 0.38465053
Iteration 27, loss = 0.38539135
Iteration 28, loss = 0.38426434
Iteration 29, loss = 0.38488928
Iteration 30, loss = 0.38380552
Iteration 31, loss = 0.38390801
Iteration 32, los

In [None]:
# Predict class labels (0/1) for the held-out test split.
y_pred = clf.predict(X_test)
y_pred

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
# Print the classification report, confusion matrix and accuracy for the test-split predictions.
evaluate_model(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4942
           1       0.76      0.42      0.54      1571

    accuracy                           0.83      6513
   macro avg       0.80      0.69      0.72      6513
weighted avg       0.82      0.83      0.81      6513

[[4734  208]
 [ 909  662]]
0.8284968524489482
