In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [3]:
# Read the raw CSV from the working directory into a DataFrame.
file_path = 'database.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns that were parsed.
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational_grade,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_worked_per_week,native_country,annual_income,censusid
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,1
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,2
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,3
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,4
4,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K,5


In [4]:
# Drop columns that must not be used as model features.
#   - 'Unnamed: 0' is the row counter (0, 1, 2, ...) visible in the preview
#     above; it was written into the CSV as an index column and would otherwise
#     be fed to the models as if it were a real feature.
#   - 'censusid' and 'fnlwgt' were already excluded by the original analysis.
# errors='ignore' keeps this cell re-runnable: executing it a second time on the
# already-trimmed frame is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0', 'censusid', 'fnlwgt'], errors='ignore')

In [5]:
# Columns that hold string categories and therefore need integer encoding.
# The target column 'annual_income' is listed too, so it is encoded the same way.
categorical_cols = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native_country',
    'annual_income',
]

In [6]:
# Create one LabelEncoder per categorical column, keyed by column name, then
# fit each encoder on its column and replace the column with the integer codes.
# The fitted encoders are kept so they can be saved alongside the model.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])

In [7]:
# The encoded 'annual_income' column is the prediction target;
# every other remaining column is used as an input feature.
y = data['annual_income']
X = data.drop('annual_income', axis=1)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Initialize the Random Forest Classifier
# All hyperparameters are left at the library defaults; only the seed is fixed
# (random_state=42) so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42)

In [11]:
# Fit the random forest on the training split (unscaled, label-encoded features).
model.fit(X=X_train, y=y_train)

In [12]:
# Predict the encoded income class for every row of the held-out test split.
y_pred = model.predict(X=X_test)

In [14]:
# Overall fraction of test rows the random forest classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1 on the same predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Accuracy: 0.8393587617468214
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.90      6827
           1       0.70      0.61      0.65      2218

    accuracy                           0.84      9045
   macro avg       0.79      0.76      0.77      9045
weighted avg       0.83      0.84      0.84      9045



In [15]:
pip install keras-tuner

Note: you may need to restart the kernel to use updated packages.




In [17]:
import tensorflow as tf
from tensorflow import keras
from keras_tuner import HyperModel, RandomSearch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [18]:
# Standardise the features for the neural network.
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Define a hypermodel class for the tuner
class IncomeHyperModel(HyperModel):
    """Search space for a feed-forward binary classifier of the income label.

    Hyperparameters explored by ``build``:
      * ``num_layers``    - number of hidden Dense + Dropout pairs (1 to 3)
      * ``units_{i}``     - width of hidden layer ``i`` (32 to 256, step 32)
      * ``dropout_{i}``   - dropout rate after hidden layer ``i`` (0.2 to 0.5, step 0.1)
      * ``learning_rate`` - Adam learning rate (1e-4 to 1e-2, log sampling)

    Args:
        input_dim: Number of input features. When omitted (the default), the
            width is read from the notebook-level ``X_train_scaled`` each time
            ``build`` runs, so ``IncomeHyperModel()`` keeps working as before.
        **kwargs: Forwarded unchanged to ``HyperModel.__init__``.
    """

    def __init__(self, input_dim=None, **kwargs):
        super().__init__(**kwargs)
        self.input_dim = input_dim

    def build(self, hp):
        """Create and compile one candidate network for the hyperparameters ``hp``.

        Args:
            hp: The hyperparameter container supplied by the tuner (or a set of
                best hyperparameters when rebuilding the winning model).

        Returns:
            A compiled ``keras.Sequential`` model with a single sigmoid output,
            binary cross-entropy loss and accuracy as its metric.
        """
        # Prefer the explicitly configured width; fall back to the notebook
        # global only when none was given.
        input_dim = self.input_dim if self.input_dim is not None else X_train_scaled.shape[1]

        model = keras.Sequential()
        # keras.Input(shape=...) replaces InputLayer(input_shape=...), whose
        # `input_shape` argument is deprecated in newer Keras releases.
        model.add(keras.Input(shape=(input_dim,)))

        # Tune the number of layers
        for i in range(hp.Int('num_layers', 1, 3)):
            # Each hidden block has its own tunable width and dropout rate.
            model.add(keras.layers.Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
                                         activation='relu'))
            model.add(keras.layers.Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.2, max_value=0.5, step=0.1)))

        # Single sigmoid unit: the output is the probability of class 1.
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        return model

In [21]:
# Initialize the hypermodel
# NOTE(review): this cell's execution count (In [21]) is lower than that of the
# cell defining IncomeHyperModel (In [24]), so the class was re-executed after
# this instance was created. Re-run this cell after any change to the class
# (or use Restart & Run All) so the instance uses the current definition.
hypermodel = IncomeHyperModel()

In [22]:
# Initialize the tuner
# Random search over the IncomeHyperModel space: up to 10 hyperparameter
# combinations, each trained twice (executions_per_trial=2), ranked by
# validation accuracy.
# seed=42 makes the sequence of sampled hyperparameter combinations repeatable,
# matching the seed used for the train/test split and the random forest.
# NOTE: `overwrite` is left at its default, so results already stored under
# income_tuner/income_prediction are reloaded instead of being recomputed.
# Delete that folder (or pass overwrite=True) to force a fresh search.
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    seed=42,
    directory='income_tuner',
    project_name='income_prediction'
)



In [23]:
# Run the hyperparameter search on the scaled training data.
# Each candidate trains for 10 epochs with 20% of the training rows held out
# for validation, which is where the 'val_accuracy' objective comes from.
tuner.search(
    X_train_scaled,
    y_train,
    epochs=10,
    validation_split=0.2,
)

Trial 10 Complete [00h 00m 39s]
val_accuracy: 0.8431453704833984

Best val_accuracy So Far: 0.8484660089015961
Total elapsed time: 00h 07m 06s


In [25]:
# Ask the tuner for its single best-scoring hyperparameter set.
best_candidates = tuner.get_best_hyperparameters(num_trials=1)
best_hyper = best_candidates[0]

# Display the chosen values as a plain dict.
best_hyper.values

{'num_layers': 2,
 'units_0': 96,
 'dropout_0': 0.30000000000000004,
 'learning_rate': 0.0010088016459101723,
 'units_1': 32,
 'dropout_1': 0.2}

In [27]:
# Build the best model
best_model = tuner.hypermodel.build(best_hyper)

# Train the model
best_model.fit(X_train_scaled, y_train, epochs=10, validation_split=0.2)

# Make predictions
y_pred = (best_model.predict(X_test_scaled) > 0.5).astype("int32")

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)

Epoch 1/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8029 - loss: 0.4180 - val_accuracy: 0.8318 - val_loss: 0.3444
Epoch 2/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8362 - loss: 0.3525 - val_accuracy: 0.8437 - val_loss: 0.3347
Epoch 3/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8382 - loss: 0.3468 - val_accuracy: 0.8398 - val_loss: 0.3332
Epoch 4/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8419 - loss: 0.3420 - val_accuracy: 0.8455 - val_loss: 0.3317
Epoch 5/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8445 - loss: 0.3367 - val_accuracy: 0.8448 - val_loss: 0.3322
Epoch 6/10
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8458 - loss: 0.3309 - val_accuracy: 0.8451 - val_loss: 0.3321
Epoch 7/10
[1m905/905[0m 

In [28]:
# Write the trained network to disk as an HDF5 file.
best_model.save('model.h5')

# Pickle the fitted StandardScaler so the exact same scaling can be re-applied
# to new inputs before they are passed to the saved model.
with open('scaler.pkl', 'wb') as scaler_out:
    pickle.dump(scaler, scaler_out)



In [29]:
# Pickle the dict of fitted LabelEncoders (one per categorical column, keyed by
# column name) so the string <-> integer mappings can be reused later.
with open('label_encoders.pkl', 'wb') as encoders_out:
    pickle.dump(label_encoders, encoders_out)