In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy.stats import zscore

# Load dataset with column names
df = pd.read_csv('adult_dataset.csv', header=None)

print(df.head())
print(df.isnull().sum())

    0          1       2             3                4                   5   \
0  age  workclass  fnlwgt     education  educational-num      marital-status   
1   25    Private  226802          11th                7       Never-married   
2   38    Private   89814       HS-grad                9  Married-civ-spouse   
3   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
4   44    Private  160323  Some-college               10  Married-civ-spouse   

                  6             7      8       9             10            11  \
0         occupation  relationship   race  gender  capital-gain  capital-loss   
1  Machine-op-inspct     Own-child  Black    Male             0             0   
2    Farming-fishing       Husband  White    Male             0             0   
3    Protective-serv       Husband  White    Male             0             0   
4  Machine-op-inspct       Husband  Black    Male          7688             0   

               12              1

In [5]:
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)
    
for col in df.select_dtypes(include=[np.number]).columns:
    df = df[df[col] >= 0]
    
df.shape

(45223, 15)

In [6]:
# --- n. Error Correction: Outlier removal using z-score ---
numeric_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]

df.shape

(45223, 15)

In [13]:
# --- o. Data Transformation ---
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop('income', axis=1)
y = df['income']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
# --- p. Model Building ---

# Logistic Regression
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
log_acc = accuracy_score(y_test, log_pred) * 100

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred) * 100

In [15]:
# Print results
print("=== Adult Income Classification ===")
print(f"Regression Method (Logistic Regression) Accuracy: {log_acc:.2f}%")
print(f"Naive Bayes Method Accuracy: {nb_acc:.2f}%")

if log_acc > nb_acc:
    print("→ Logistic Regression performed better.")
elif nb_acc > log_acc:
    print("→ Naive Bayes performed better.")
else:
    print("→ Both models performed equally.")

=== Adult Income Classification ===
Regression Method (Logistic Regression) Accuracy: 83.32%
Naive Bayes Method Accuracy: 79.37%
→ Logistic Regression performed better.
