In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np

# Adult-income classification pipeline: load the CSV, clean it, remove
# outliers, encode categoricals, then compare Logistic Regression and
# Gaussian Naive Bayes accuracy on a held-out 20% test split.

# Load dataset
df = pd.read_csv("adult_dataset.csv")

# m. Data cleaning: remove missing values ('?') and negative numbers
# Accept the missing-value marker with or without the leading space so the
# cleaning step does not silently keep '?' rows.
df = df.replace([' ?', '?'], pd.NA).dropna()
non_negative_rows = (df.select_dtypes(include='number') >= 0).all(axis=1)
df = df[non_negative_rows]

# n. Error correcting: Outlier removal using Z-score method
numeric_cols = df.select_dtypes(include='number').columns
col_std = df[numeric_cols].std()
# A column with zero (or undefined) standard deviation yields NaN z-scores,
# and `NaN < 3` is False, which would discard every row. Such a column
# cannot contain outliers, so leave it out of the check.
varying_cols = col_std[col_std > 0].index
z_scores = np.abs(
    (df[varying_cols] - df[varying_cols].mean()) / col_std[varying_cols]
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the filtered original.
df = df[(z_scores < 3).all(axis=1)].copy()

if df.empty:
    raise ValueError("No rows left after data cleaning and outlier removal")

# o. Data transformation: Encode categorical variables and scale features
# NOTE: LabelEncoder maps each category to an integer code, so nominal
# features end up with an arbitrary numeric order.
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

X = df.drop("income", axis=1)
y = df["income"]

# p. Build data models and compare accuracy
# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only; fitting it on the full matrix would leak test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix scaled with the training statistics, kept under its
# original name for any later code that refers to it.
X_scaled = scaler.transform(X)

# Logistic Regression
# max_iter is raised above the default to give the solver room to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_acc = accuracy_score(y_test, lr.predict(X_test))

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_acc = accuracy_score(y_test, nb.predict(X_test))

# Print results
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"Naive Bayes Accuracy: {nb_acc * 100:.2f}%")
