In [72]:
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" -O data/adult.data
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names" -O data/adult.names
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" -O data/adult.test

--2023-04-10 12:31:14--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘data/adult.data’


2023-04-10 12:31:15 (6.07 MB/s) - ‘data/adult.data’ saved [3974305/3974305]

--2023-04-10 12:31:15--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5229 (5.1K) [application/x-httpd-php]
Saving to: ‘data/adult.names’


2023-04-10 12:31:15 (5.05 MB/s) - ‘data/adult.names’ saved [5229/5229]

--2023-04-10 12:31:15--  http://archive.ics.uci.edu/ml/machine-learning-databases/

In [73]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [115]:
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label"
]
df = pl.read_csv("data/adult-all.csv", has_header=False, null_values="?")
df.columns = column_names
# drop nulls:
old_len = len(df)
df = df.drop_nulls()
print(f"Dropped rows because of nulls: {old_len - len(df)}")

# Convert to categorical columns from string: 
# df = df.with_columns([
#     pl.col(["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country", "label"]).cast(pl.Categorical),
# ])

Dropped rows because of nulls: 3620


In [116]:
# Counting the class distribution of the label column:
df.groupby("label", maintain_order=True).agg(pl.count())

# We can see that we have a class imbalance here. We can fix this by undersampling the majority class and match the two classes.

label,count
str,u32
"""<=50K""",34014
""">50K""",11208


In [117]:
df_pd = df.to_pandas()
X, y = df_pd.drop("label", axis=1), df_pd["label"]

cat_columns = X.select_dtypes(include=["object", "bool"]).columns
num_columns = X.select_dtypes(include=["int64", "float64"]).columns

print(f"Categorical columns: {cat_columns}\n", "*"*40, f"\nNumerical columns: {num_columns}")
print(f"Y Label column: {y.name}")

# Sanity Check
assert(len(cat_columns) + len(num_columns) == len(X.columns))

Categorical columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
 **************************************** 
Numerical columns: Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
Y Label column: label


In [118]:
# Try several models to pick the best one for our use case: 
model_dict = {
    "rfc" : RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    "gbc" : GradientBoostingClassifier(n_estimators=100, random_state=42),
    "lrc": LogisticRegression(),
    "dummy": DummyClassifier(strategy="most_frequent")
}

y_encoded = LabelEncoder().fit_transform(y)

In [119]:
from sklearn.model_selection import RepeatedStratifiedKFold

results = {}
for name, model in model_dict.items():
    transformation_steps = [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_columns),
        ("num", MinMaxScaler(), num_columns)
    ]
    column_transformer = ColumnTransformer(transformers=transformation_steps)
    pipeline = Pipeline(steps=[
        ('col_transformer', column_transformer),
        ('model', model)
    ])
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    scores = cross_val_score(pipeline, X, y_encoded, scoring="accuracy", cv=cv, n_jobs=-1)
    
    results[name] = scores
    
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [120]:
print(results)

{'rfc': array([0.82819237, 0.8432283 , 0.83381247, 0.8325962 , 0.83215391,
       0.83250415, 0.83935876, 0.83171163, 0.83856701, 0.83115878,
       0.83349917, 0.82907684, 0.83126935, 0.83547103, 0.83536046]), 'gbc': array([0.85948038, 0.8750691 , 0.86068111, 0.86145511, 0.86001769,
       0.85572139, 0.86733002, 0.86775763, 0.86543565, 0.85990712,
       0.85837479, 0.86323936, 0.86145511, 0.8641088 , 0.86399823]), 'lrc': array([0.84488668, 0.8564953 , 0.84398496, 0.84475896, 0.84564352,
       0.83836374, 0.84754008, 0.84918178, 0.84962406, 0.8487395 ,
       0.84687673, 0.84798231, 0.84719151, 0.84531181, 0.84896064]), 'dummy': array([0.75212825, 0.75212825, 0.75221141, 0.75221141, 0.75210084,
       0.75212825, 0.75212825, 0.75221141, 0.75221141, 0.75210084,
       0.75212825, 0.75212825, 0.75221141, 0.75221141, 0.75210084])}
