In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Step 2: Load Dataset
dataset = pd.read_csv('adult.csv')
print(dataset.head())
print('Rows:', dataset.shape[0], 'Columns:', dataset.shape[1])

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [None]:
# Step 3: Data Info and Null Checks
dataset.info()
print(dataset.describe().T)

# Check for null values and missing '?' values
print(round((dataset.isnull().sum() / dataset.shape[0]) * 100, 2).astype(str) + ' %')
print(round((dataset.isin(['?']).sum() / dataset.shape[0]) * 100, 2).astype(str) + ' %')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
                   count           mean            std      min       25%  \

In [None]:
# Step 4: Check Target Balance
income_dist = dataset['income'].value_counts(normalize=True)
print(round(income_dist * 100, 2).astype(str) + ' %')

income
<=50K    76.07 %
>50K     23.93 %
Name: proportion, dtype: object


In [27]:
# Step 5: Replace '?' with np.nan and handle missing values
dataset = dataset.replace('?', np.nan)
columns_with_nan = dataset.columns[dataset.isnull().any()].tolist()

for col in columns_with_nan:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])


In [None]:
# Step 6: Encode Categorical Features
for col in dataset.columns:
    if dataset[col].dtypes == 'object':
        encoder = LabelEncoder()
        dataset[col] = encoder.fit_transform(dataset[col])

In [None]:
# Step 7: Feature and Target Split
X = dataset.drop('income', axis=1)
Y = dataset['income']

In [None]:
# Step 8: Feature Importance (Optional but helpful)
selector = ExtraTreesClassifier(random_state=42)
selector.fit(X, Y)
feature_imp = selector.feature_importances_
for index, val in enumerate(feature_imp):
    print(f"{X.columns[index]}: {round(val * 100, 2)}%")

age: 15.75%
workclass: 3.88%
fnlwgt: 17.64%
education: 3.78%
educational-num: 8.34%
marital-status: 7.27%
occupation: 7.05%
relationship: 9.01%
race: 1.41%
gender: 2.96%
capital-gain: 8.9%
capital-loss: 2.89%
hours-per-week: 9.58%
native-country: 1.54%


In [None]:
# Step 9: Drop Less Important Features (safely)
cols_to_drop = ['workclass', 'education', 'race', 'sex', 'capital.loss', 'native.country']
existing_cols_to_drop = list(set(cols_to_drop) & set(X.columns))
X = X.drop(existing_cols_to_drop, axis=1)
print("Dropped columns:", existing_cols_to_drop)

Dropped columns: ['education', 'race', 'workclass']


In [None]:
# Step 10: Feature Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# Step 11: Handle Class Imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, Y_resampled = ros.fit_resample(X, Y)
print(round(Y_resampled.value_counts(normalize=True) * 100, 2).astype(str) + ' %')

income
0    50.0 %
1    50.0 %
Name: proportion, dtype: object


In [None]:
# Step 12: Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42)
print("X_train:", X_train.shape, "X_test:", X_test.shape)

X_train: (59448, 11) X_test: (14862, 11)


In [None]:
# Step 13: Base Random Forest Model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

print("Random Forest Classifier (Base Model):")
print("Accuracy:", round(accuracy_score(Y_test, Y_pred_rf) * 100, 2))
print("F1 Score:", round(f1_score(Y_test, Y_pred_rf) * 100, 2))

Random Forest Classifier (Base Model):
Accuracy: 93.48
F1 Score: 93.76


In [None]:
# Step 14: Hyperparameter Tuning
n_estimators = [int(x) for x in np.linspace(40, 150, 15)]
max_depth = [int(x) for x in np.linspace(40, 150, 15)]
param_dist = {'n_estimators': n_estimators, 'max_depth': max_depth}

rf_tuned = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(rf_tuned, param_distributions=param_dist, cv=5, random_state=42, n_iter=10)
rf_cv.fit(X_train, Y_train)

print("Best Cross-Validation Score:", round(rf_cv.best_score_ * 100, 2))
print("Best Parameters:", rf_cv.best_params_)

In [25]:
# Step 15: Final Model with Best Params
best_params = rf_cv.best_params_
rf_best = RandomForestClassifier(**best_params, random_state=42)
rf_best.fit(X_train, Y_train)
Y_pred_best = rf_best.predict(X_test)

print("Random Forest Classifier (Tuned):")
print("Accuracy:", round(accuracy_score(Y_test, Y_pred_best) * 100, 2))
print("F1 Score:", round(f1_score(Y_test, Y_pred_best) * 100, 2))

Random Forest Classifier (Tuned):
Accuracy: 93.47
F1 Score: 93.75


In [26]:
# Step 16: Confusion Matrix & Classification Report
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred_best))
print("Classification Report:\n", classification_report(Y_test, Y_pred_best))

Confusion Matrix:
 [[6612  822]
 [ 148 7280]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.89      0.93      7434
           1       0.90      0.98      0.94      7428

    accuracy                           0.93     14862
   macro avg       0.94      0.93      0.93     14862
weighted avg       0.94      0.93      0.93     14862

