In [21]:
from collections import namedtuple
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from joblib import dump, load
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier

# Step 1: Load and Process Data

1. We start by loading our data from the CSV file.
2. The CSV file does not have column names, so we set those as well.
3. Then we drop columns that are irrelevant and drop rows with null values.
4. We then set the right data types for the columns, since they don't get inferred correctly.

In [22]:
# LabelEncoder is documented under sklearn.preprocessing; it is not part of
# sklearn.calibration's documented API, so import it from its public location.
from sklearn.preprocessing import LabelEncoder


# Location of the dataset CSV, relative to the notebook's working directory.
DATASET_FILE_PATH = "data/adult-all.csv"

# The CSV is read without a header row, so these names are applied to the
# columns positionally, in file order. "label" is the prediction target.
COLUMN_NAMES = [
    "age", "workclass", "fnlwgt",
    "education", "education-num",
    "marital-status", "occupation", "relationship",
    "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country",
    "label",
]


def load_data() -> pl.DataFrame:
    """Read the headerless dataset CSV and attach the expected column names.

    Cells whose value is "?" are parsed as nulls.

    Returns:
        A polars DataFrame whose columns are named after COLUMN_NAMES.
    """
    raw_frame = pl.read_csv(
        DATASET_FILE_PATH,
        has_header=False,
        null_values=["?"],
    )
    # Assigning to .columns renames positionally.
    raw_frame.columns = COLUMN_NAMES
    return raw_frame


def pre_process_data(df: pl.DataFrame) -> pd.DataFrame:
    """Clean the raw frame and hand it over as a pandas DataFrame.

    The "fnlwgt" column is removed, every row containing a null is discarded,
    and the result is converted to pandas for use with scikit-learn.

    Args:
        df: The polars frame to clean; it must contain a "fnlwgt" column.

    Returns:
        The cleaned data as a pandas DataFrame.
    """
    return (
        df.drop(["fnlwgt"])   # drop unnecessary columns
        .drop_nulls()         # drop rows with any null
        .to_pandas()          # sklearn works on pandas / numpy
    )
    
    
# The typename matches the bound name so that reprs read naturally and so that
# instances can be pickled / joblib-dumped (pickle looks the class up by name).
TrainingTuple = namedtuple(
    "TrainingTuple",
    ["X_train", "y_train", "X_test", "y_test", "categorical_features", "numerical_features"],
)


def prepare_data_for_training(df: pd.DataFrame) -> TrainingTuple:
    """Split the cleaned frame into train/test sets and classify its feature columns.

    Args:
        df: Cleaned pandas DataFrame containing a "label" column (the target)
            alongside the feature columns.

    Returns:
        A TrainingTuple holding the 80/20 train/test split (random_state=42)
        and the names of the categorical and numerical feature columns.
    """
    X, y = df.drop("label", axis=1), df["label"]

    # Select by dtype family rather than by exact dtype: a column that falls in
    # neither list would later be dropped silently by the ColumnTransformer
    # (its default is remainder="drop"), e.g. int32/float32 or category columns.
    cat_columns = X.select_dtypes(include=["object", "bool", "category"]).columns
    num_columns = X.select_dtypes(include="number").columns

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return TrainingTuple(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        categorical_features=cat_columns,
        numerical_features=num_columns,
    )

def train_model(training_tuple: TrainingTuple) -> "tuple[Pipeline, LabelEncoder]":
    """Fit the preprocessing + XGBoost pipeline and print its score on the test split.

    Args:
        training_tuple: Train/test data and feature-name lists, as returned by
            prepare_data_for_training.

    Returns:
        A ``(pipeline, encoder)`` pair: the fitted Pipeline and the LabelEncoder
        that maps the raw labels to the integer classes the model was trained on.
        Callers need the encoder to score or decode predictions.
    """
    # The encoder is fitted on the training labels only, so transforming the
    # test labels raises if the test split contains a class unseen in training.
    label_encoder = LabelEncoder().fit(training_tuple.y_train)
    y_train_encoded = label_encoder.transform(training_tuple.y_train)
    y_test_encoded = label_encoder.transform(training_tuple.y_test)

    # One-hot encode categoricals (categories unseen at fit time are ignored at
    # predict time) and rescale numerics with MinMaxScaler.
    column_transformer = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), training_tuple.categorical_features),
            ("num", MinMaxScaler(), training_tuple.numerical_features),
        ]
    )

    # NOTE: the step name 'col_transfomer' is misspelled but is kept as-is; it
    # is part of the persisted pipeline and may be referenced by consumers.
    pipeline = Pipeline(
        steps=[
            ('col_transfomer', column_transformer),
            ('model', XGBClassifier()),
        ]
    )
    pipeline.fit(training_tuple.X_train, y_train_encoded)

    score = pipeline.score(training_tuple.X_test, y_test_encoded)
    print(f"Score for our final model on our test set! {score:.3f}")

    return pipeline, label_encoder

In [23]:
def drift_data(df: pd.DataFrame, cols_to_drift: list, seed=42, shift_divisor: float = 1.5) -> pd.DataFrame:
    """Return a copy of ``df`` with Gaussian noise added to the selected columns.

    For every column in ``cols_to_drift`` the noise is drawn from a normal
    distribution centred at ``column_mean / shift_divisor`` with the column's
    own standard deviation, so the column's distribution is both shifted and
    widened. The input frame is not modified.

    Args:
        df: Source data; the listed columns must be numeric.
        cols_to_drift: Names of the columns to perturb, processed in order.
        seed: Seed for the random generator, making the drift reproducible.
        shift_divisor: Divides the column mean to obtain the mean of the noise.
            Smaller values produce a larger shift. Defaults to 1.5.

    Returns:
        A new DataFrame with the drifted columns; other columns are unchanged.

    Raises:
        ValueError: If ``shift_divisor`` is zero.
    """
    if shift_divisor == 0:
        raise ValueError("shift_divisor must be non-zero")

    rng = np.random.default_rng(seed)
    df_drifted = df.copy()
    n_rows = df_drifted.shape[0]

    for col in cols_to_drift:
        # Statistics come from the untouched input, not the partially drifted copy.
        col_mean = df[col].mean()
        col_std = df[col].std()
        noise = rng.normal(col_mean / shift_divisor, col_std, n_rows)
        df_drifted[col] = df_drifted[col] + noise

    return df_drifted

    

## Put it all together! 

In [24]:
# End-to-end run: load -> clean -> split -> fit (train_model prints the test score).
data = load_data()
processed_data = pre_process_data(data)
training_tuple = prepare_data_for_training(processed_data)
pipeline, y_encoder = train_model(training_tuple)

# Add noise to four test-set columns to simulate drift, then score the
# already-fitted pipeline on the drifted features against the same labels.
columns_to_drift = ["age", "hours-per-week", "capital-gain", "capital-loss"]
drifted_test_set_x = drift_data(training_tuple.X_test, columns_to_drift)

encoded_y_test = y_encoder.transform(training_tuple.y_test)
drifted_score = pipeline.score(drifted_test_set_x, encoded_y_test)
print(f"Score for our final model on our drifted test set! {drifted_score:.3f}")

Score for our final model on our test set! 0.873
Score for our final model on our drifted test set! 0.680


In [25]:
# Save our pipeline and encoder!
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
Path("models").mkdir(parents=True, exist_ok=True)
dump(pipeline, "models/adult_pipeline.joblib")
dump(y_encoder, "models/adult_label_encoder.joblib")