In [1]:
import os
import sys

from dotenv import load_dotenv

root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if root not in sys.path:
    sys.path.append(root)

from script import setup

load_dotenv()
access_token = os.getenv("HUGGINGFACE_ACCESS_TOKEN")

In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

dataset = load_dataset("scikit-learn/adult-census-income", split="train")
dataset = dataset.to_pandas()

In [3]:
df, df_test = train_test_split(dataset, test_size=0.3, random_state=42)

df["label"] = df["income"].apply(lambda x: 1 if x == ">50K" else 0)
df_test["label"] = df_test["income"].apply(lambda x: 1 if x == ">50K" else 0)

X_train = df.drop(columns=["income", "label"])
y_train = df["label"]
X_test = df_test.drop(columns=["income", "label"])
y_test = df_test["label"]
numeric_features = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
# Ensure numeric_features and categorical_features match the actual column names
numeric_features = [col for col in numeric_features if col in X_train.columns]
categorical_features = [col for col in X_train.columns if col not in numeric_features]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(random_state=42)),
    ]
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.8704
