In [12]:
! pip install scikeras

import numpy as np
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:
np.random.seed(42)
tf.random.set_seed(42)

In [15]:
Train = pd.read_csv("USCensusTraining.csv", na_values=[" ?","?","? "])
Train['income'] = Train['income'].map({'>50K.':1,'<=50K.':0})
Train = Train[Train['native-country'] != 'Holand-Netherlands']

cat_cols = Train.select_dtypes(exclude=['number']).columns.tolist()

for col in cat_cols:
    dominant_value = Train[col].mode()[0]
    Train[col] = Train[col].fillna(dominant_value)

X_t = Train.drop(columns=["income"])
y_t = Train["income"]

X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)

# 3) Column types


numcol = X_train.select_dtypes(include=['number']).columns
catcol = X_train.select_dtypes(exclude=['number']).columns


# 4) Preprocess: impute+scale numeric, impute+onehot categorical
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", preprocessing.MinMaxScaler())
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # use sparse=False if needed
])


preprocessor = ColumnTransformer(
    transformers= [
        ("num", numeric_pipe, numcol),
        ("cat", categorical_pipe, catcol)
    ], remainder="drop"
)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared  = preprocessor.transform(X_test)


#pd.DataFrame(X_train_prepared).describe()
#pd.DataFrame(X_test_prepared).describe()

In [4]:
def build_model(optimizer="RMSprop", lr=0.001, units=32, act="sigmoid"):
    model = Sequential(name="ANN_Tunable")
    model.add(Input(shape=(X_train_prepared.shape[1],), name="input_features")) 
    model.add(Dense(units, activation=act, kernel_initializer="glorot_uniform", name="hidden"))
    model.add(Dense(1, activation="sigmoid", name="output"))
    
    if isinstance(optimizer, str):
        opt_name = optimizer.lower()
        if opt_name == "rmsprop":
            opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
        elif opt_name in ("sgd", "sgd_m"):
            opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
        else:
            opt = tf.keras.optimizers.get(optimizer)
    else:
        opt = optimizer
        
    model.compile(optimizer=opt,loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [5]:
clf = KerasClassifier(model=build_model, epochs=40, batch_size=32, verbose=2, random_state=42)
param_grid = {
    "model__units": [16, 32, 64],
    "model__optimizer": ["RMSprop", "sgd"],
    "model__lr": [0.01, 0.001],
    "model__act": ["relu","sigmoid"],
    "epochs": [20],
    "batch_size": [32]   
}

random_search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=3, scoring="accuracy", verbose=2)


In [None]:
Grid_search.fit(X_train_prepared, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Epoch 1/20
625/625 - 1s - 2ms/step - accuracy: 0.8140 - loss: 0.3982
Epoch 2/20
625/625 - 1s - 1ms/step - accuracy: 0.8377 - loss: 0.3467
Epoch 3/20
625/625 - 1s - 1ms/step - accuracy: 0.8416 - loss: 0.3397
Epoch 4/20
625/625 - 1s - 1ms/step - accuracy: 0.8438 - loss: 0.3353
Epoch 5/20
625/625 - 1s - 980us/step - accuracy: 0.8463 - loss: 0.3321
Epoch 6/20
625/625 - 1s - 1ms/step - accuracy: 0.8479 - loss: 0.3296
Epoch 7/20
625/625 - 1s - 1ms/step - accuracy: 0.8490 - loss: 0.3274
Epoch 8/20
625/625 - 1s - 991us/step - accuracy: 0.8498 - loss: 0.3257
Epoch 9/20
625/625 - 1s - 1ms/step - accuracy: 0.8508 - loss: 0.3241
Epoch 10/20
625/625 - 1s - 977us/step - accuracy: 0.8518 - loss: 0.3228
Epoch 11/20
625/625 - 1s - 975us/step - accuracy: 0.8519 - loss: 0.3216
Epoch 12/20
625/625 - 1s - 1ms/step - accuracy: 0.8523 - loss: 0.3206
Epoch 13/20
625/625 - 1s - 1ms/step - accuracy: 0.8522 - loss: 0.3197
Epoch 14/20
625/625 - 1s - 1ms

0,1,2
,estimator,KerasClassifi..._weight=None )
,param_grid,"{'batch_size': [32], 'epochs': [20], 'model__act': ['relu', 'sigmoid'], 'model__lr': [0.01, 0.001], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,model,<function bui...002022708F240>
,build_fn,
,warm_start,False
,random_state,42
,optimizer,'rmsprop'
,loss,
,metrics,
,batch_size,32
,validation_batch_size,
,verbose,2


In [7]:
print("Best params:", random_search.best_params_)
print("Best CV accuracy", random_search.best_score_)

Best params: {'batch_size': 32, 'epochs': 20, 'model__act': 'relu', 'model__lr': 0.001, 'model__optimizer': 'RMSprop', 'model__units': 32}
Best CV accuracy 0.8498428213902837


In [9]:
# type(X_train_prepared),getattr(X_train_prepared, "dtype", None)
# getattr(y_train, "dtype", None)
# y_train

In [10]:
model = build_model(optimizer="RMSprop", lr=0.001, units=64, act="relu")
history = model.fit(X_train_prepared, y_train, epochs=20, batch_size=32, validation_split=0.2,verbose=2)

y_train_prob = model.predict(X_train_prepared)
y_test_prob = model.predict(X_test_prepared)

y_train_pred = [round(y[0]) for y in y_train_prob]
y_test_pred = [round(y[0]) for y in y_test_prob]

Epoch 1/20
500/500 - 1s - 3ms/step - accuracy: 0.8186 - loss: 0.3867 - val_accuracy: 0.8382 - val_loss: 0.3413
Epoch 2/20
500/500 - 1s - 2ms/step - accuracy: 0.8374 - loss: 0.3472 - val_accuracy: 0.8445 - val_loss: 0.3322
Epoch 3/20
500/500 - 1s - 2ms/step - accuracy: 0.8404 - loss: 0.3406 - val_accuracy: 0.8462 - val_loss: 0.3275
Epoch 4/20
500/500 - 1s - 2ms/step - accuracy: 0.8426 - loss: 0.3365 - val_accuracy: 0.8505 - val_loss: 0.3244
Epoch 5/20
500/500 - 1s - 2ms/step - accuracy: 0.8447 - loss: 0.3334 - val_accuracy: 0.8535 - val_loss: 0.3219
Epoch 6/20
500/500 - 1s - 2ms/step - accuracy: 0.8452 - loss: 0.3308 - val_accuracy: 0.8528 - val_loss: 0.3203
Epoch 7/20
500/500 - 1s - 2ms/step - accuracy: 0.8463 - loss: 0.3287 - val_accuracy: 0.8545 - val_loss: 0.3188
Epoch 8/20
500/500 - 1s - 1ms/step - accuracy: 0.8482 - loss: 0.3267 - val_accuracy: 0.8543 - val_loss: 0.3176
Epoch 9/20
500/500 - 1s - 1ms/step - accuracy: 0.8491 - loss: 0.3251 - val_accuracy: 0.8553 - val_loss: 0.3166
E

In [13]:
print("=== Training Data ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))

# Test data
print("=== Test Data ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

=== Training Data ===
Accuracy: 0.855942797139857
Confusion Matrix:
 [[14348   899]
 [ 1982  2770]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91     15247
           1       0.75      0.58      0.66      4752

    accuracy                           0.86     19999
   macro avg       0.82      0.76      0.78     19999
weighted avg       0.85      0.86      0.85     19999

=== Test Data ===
Accuracy: 0.8518
Confusion Matrix:
 [[3565  203]
 [ 538  694]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      3768
           1       0.77      0.56      0.65      1232

    accuracy                           0.85      5000
   macro avg       0.82      0.75      0.78      5000
weighted avg       0.85      0.85      0.84      5000



In [14]:
model.summary()