In [2]:
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn.model_selection import train_test_split

dataarff = arff.loadarff("phpMawTba.arff")
adult_census = pd.DataFrame(dataarff[0])
adult_census[list(adult_census.columns.values)] = adult_census[list(adult_census.columns.values)].applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
adult_census = adult_census.drop(columns=["education-num", "fnlwgt"])
data, target = adult_census.drop(columns="class"), adult_census["class"]
for column in data.columns:
    if data[column].dtype == 'float64':
        data[column] = data[column].astype(np.int64)

  adult_census[list(adult_census.columns.values)] = adult_census[list(adult_census.columns.values)].applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


In [7]:
from sklearn.compose import make_column_selector as selector

num_selector = selector(dtype_exclude='object')
cat_selector = selector(dtype_include="object")

numerical_columns = num_selector(data)
categorical_columns = cat_selector(data)

data[numerical_columns].head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25,0,0,40
1,38,0,0,50
2,28,0,0,40
3,44,7688,0,40
4,18,0,0,30


In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [9]:
from sklearn.compose import ColumnTransformer
preprocesser = ColumnTransformer([
    ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ("standard-scaler", numerical_preprocessor, numerical_columns)
])

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocesser, LogisticRegression(max_iter=500))
model

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [14]:
_ = model.fit(X_train, y_train)

In [16]:
X_test.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
7762,56,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States
23881,25,Private,HS-grad,Married-civ-spouse,Transport-moving,Own-child,Other,Male,0,0,40,United-States
30507,43,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,14344,0,40,United-States
28911,32,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States
19484,39,Private,Bachelors,Married-civ-spouse,Sales,Wife,White,Female,0,0,30,United-States


In [17]:
model.predict(X_test)[:5]

array(['<=50K', '<=50K', '>50K', '<=50K', '>50K'], dtype=object)

In [18]:
y_test[:5]

7762     <=50K
23881    <=50K
30507     >50K
28911    <=50K
19484    <=50K
Name: class, dtype: object

In [19]:
model.score(X_test, y_test)

0.8575055278028008

In [20]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data, target, cv=5)
cv_results

{'fit_time': array([0.49560332, 0.46728706, 0.44830751, 0.54483509, 0.51984239]),
 'score_time': array([0.06759596, 0.05560756, 0.04946041, 0.05082822, 0.04891968]),
 'test_score': array([0.85116184, 0.84993346, 0.8482801 , 0.85257985, 0.85544636])}

In [21]:
scores=cv_results["test_score"]
print(
    "The mean cross-validation accuracy is:"
    f"{scores.mean():.3f} +- {scores.std():.3f}"
)

The mean cross-validation accuracy is:0.851 +- 0.002


In [22]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)

preprocesser = ColumnTransformer(
    [("categorical", categorical_preprocessor, categorical_columns)],
    remainder="passthrough"
)

model = make_pipeline(preprocesser, HistGradientBoostingClassifier())

In [23]:
%%time
_ = model.fit(X_train, y_train)

CPU times: total: 12 s
Wall time: 1.29 s


In [24]:
model.score(X_test,y_test)

0.8807632462533781