In [1]:
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" -O data/adult.data
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names" -O data/adult.names
! wget "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" -O data/adult.test

--2023-04-10 20:51:01--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘data/adult.data’


2023-04-10 20:51:02 (6.02 MB/s) - ‘data/adult.data’ saved [3974305/3974305]

--2023-04-10 20:51:02--  http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5229 (5.1K) [application/x-httpd-php]
Saving to: ‘data/adult.names’


2023-04-10 20:51:02 (92.3 MB/s) - ‘data/adult.names’ saved [5229/5229]

--2023-04-10 20:51:02--  http://archive.ics.uci.edu/ml/machine-learning-databases/

In [1]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, 
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [38]:
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label"
]
df = pl.read_csv("data/adult-all.csv", has_header=False, null_values="?")
df.columns = column_names
# drop nulls:
old_len = len(df)
df = df.drop_nulls()
print(f"Dropped rows because of nulls: {old_len - len(df)}")


# drop fnlwgt column:

df = df.drop(["fnlwgt"])
# Convert to categorical columns from string: 
# df = df.with_columns([
#     pl.col(["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country", "label"]).cast(pl.Categorical),
# ])

Dropped rows because of nulls: 3620


In [39]:
# Counting the class distribution of the label column:
df.groupby("label", maintain_order=True).agg(pl.count())

# We can see that we have a class imbalance here. We can fix this by undersampling the majority class and match the two classes.

label,count
str,u32
"""<=50K""",34014
""">50K""",11208


In [40]:
df_pd = df.to_pandas()
X, y = df_pd.drop("label", axis=1), df_pd["label"]

cat_columns = X.select_dtypes(include=["object", "bool"]).columns
num_columns = X.select_dtypes(include=["int64", "float64"]).columns

print(f"Categorical columns: {cat_columns}\n", "*"*40, f"\nNumerical columns: {num_columns}")
print(f"Y Label column: {y.name}")

# Sanity Check
assert(len(cat_columns) + len(num_columns) == len(X.columns))

Categorical columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
 **************************************** 
Numerical columns: Index(['age', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
Y Label column: label


In [41]:
# Try several models to pick the best one for our use case: 
model_dict = {
    "rfc" : RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    "gbc" : GradientBoostingClassifier(n_estimators=100, random_state=42),
    "lrc": LogisticRegression(),
    "xgb": XGBClassifier(),
    "dummy": DummyClassifier(strategy="most_frequent"),
}

y_encoded = LabelEncoder().fit_transform(y)

In [42]:
# from sklearn.model_selection import RepeatedStratifiedKFold

# results = {}
# for name, model in model_dict.items():
#     transformation_steps = [
#         ("cat", OneHotEncoder(handle_unknown="ignore"), cat_columns),
#         ("num", MinMaxScaler(), num_columns)
#     ]
#     column_transformer = ColumnTransformer(transformers=transformation_steps)
#     pipeline = Pipeline(steps=[
#         ('col_transformer', column_transformer),
#         ('model', model)
#     ])
#     cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
#     scores = cross_val_score(pipeline, X, y_encoded, scoring="accuracy", cv=cv, n_jobs=-1)
    
#     results[name] = scores
    
    

In [43]:
# for k, v in results.items():
#     print(f"{k}: {v.mean():.3f} ({v.std():.3f})")

## Main Model Training: 

In [44]:
# We can see that our Xgboost classifier performs the best. Lets use that with a training and test set. 
# We will then mutate the test set with drift to see how it affects the performance of our model! 

# We will need the label encoder later! 
label_encoder = LabelEncoder().fit(y)

y_encoded = label_encoder.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
transformation_steps = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_columns),
    ("num", MinMaxScaler(), num_columns)
]
model = XGBClassifier()
column_transformer = ColumnTransformer(transformers=transformation_steps)

pipeline = Pipeline(steps=[
    ('col_transformer', column_transformer),
    ('model', model)
])

pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f"Score of our final model on our test set: {score:.3f}")

# X_test.to_csv("data/x_adult_test_stream.csv", index=False)
# pd.DataFrame(y_test).to_csv("data/y_adult_test_stream.csv", index=False)

Score of our final model on our test set: 0.868


In [87]:
import pickle
from joblib import dump, load
dump (pipeline, "models/adult_pipeline.joblib")
dump (label_encoder, "models/adult_label_encoder.joblib")

['models/adult_label_encoder.joblib']

## Add Data Drift to our test set: 

In [45]:
# from numpy import interp
# import strlearn as sl
# X_num = X[num_columns]

# stream = sl.streams.SemiSyntheticStreamGenerator(
#     X_num,
#     y_encoded,
#     n_drifts=4,
#     interpolation="cubic",
# )

# clf = pipeline
# evaluator = sl.evaluators.TestThenTrain(metrics=["accuracy"])
# evaluator.process(stream, clf)

Well that was a failed experiment. So streams does not support non-numeric types. As such, we need to go do data drifts manually! 

In [46]:
rng = np.random.default_rng(42)

# Introduce drift to our test set to see how our model performs on it first: 
# We will do this by introducng gaussian noise to the numerical columns of our test set.

# Select a subset of the numerical columns to introduce drift to: 
drift_columns = num_columns[1:3] # We will introduce drift to the second and third numerical columns

# This will essentially change P(X)
X_test_drifted = X_test.copy()
for col in drift_columns:
    mean_col = X_test_drifted[col].mean()
    X_test_drifted[col] += np.round(rng.normal(mean_col/2, mean_col/3, size=len(X_test_drifted)))

In [47]:
X_test_drifted

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
36919,27,Private,Some-college,16.0,Never-married,Adm-clerical,Own-child,White,Female,496.0,0,40,Nicaragua
17947,43,Local-gov,Prof-school,17.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,15912.0,0,45,United-States
3173,49,Local-gov,Bachelors,21.0,Divorced,Prof-specialty,Unmarried,Black,Female,1863.0,0,40,United-States
35365,39,Local-gov,Assoc-voc,19.0,Married-civ-spouse,Protective-serv,Husband,White,Male,724.0,0,40,United-States
13386,32,Private,Some-college,8.0,Separated,Exec-managerial,Unmarried,Black,Female,289.0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29647,32,Private,Assoc-acdm,19.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,-69.0,0,62,United-States
38393,64,Without-pay,HS-grad,18.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,1137.0,0,60,United-States
5522,22,Private,HS-grad,10.0,Never-married,Handlers-cleaners,Own-child,White,Male,1264.0,0,40,United-States
43839,51,Private,HS-grad,12.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,781.0,0,40,United-States


In [48]:
X_test

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
36919,27,Private,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,Nicaragua
17947,43,Local-gov,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,0,45,United-States
3173,49,Local-gov,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,0,0,40,United-States
35365,39,Local-gov,Assoc-voc,11,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
13386,32,Private,Some-college,10,Separated,Exec-managerial,Unmarried,Black,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29647,32,Private,Assoc-acdm,12,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,62,United-States
38393,64,Without-pay,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,60,United-States
5522,22,Private,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,40,United-States
43839,51,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States


In [49]:
updated_score = pipeline.score(X_test_drifted, y_test)
print(f"Score of our final model on our test set after drift: {updated_score:.3f}")

Score of our final model on our test set after drift: 0.805


In [50]:
from evidently.metric_preset import DataDriftPreset
from evidently.report import Report
from evidently.pipeline.column_mapping import ColumnMapping

In [89]:
data_columns = ColumnMapping()

data_columns.numerical_features = num_columns.to_list()
data_columns.categorical_features = cat_columns.to_list()

AttributeError: 'ColumnMapping' object has no attribute 'to_pickle'

In [91]:
import pickle
with open('models/data_columns.pickle', 'wb') as out_file:
    pickle.dump(data_columns, out_file)

In [52]:
drift_report = Report(metrics=[DataDriftPreset()])
drift_report.run(
    reference_data=X_train,
    current_data=X_test_drifted,
    column_mapping=data_columns,
)

In [88]:
X_train.to_csv("data/x_adult_train.csv", index=False)

In [53]:
rp = drift_report.as_pandas()

In [54]:
rp.keys()

dict_keys(['DatasetDriftMetric', 'DataDriftTable'])

In [55]:
rp['DatasetDriftMetric']

Unnamed: 0,drift_share,number_of_columns,number_of_drifted_columns,share_of_drifted_columns,dataset_drift
0,0.5,13,1,0.076923,False


In [56]:
d = drift_report.as_dict()

In [57]:
d['metrics'][1]["result"]["drift_by_columns"]

{'age': {'column_name': 'age',
  'column_type': 'num',
  'stattest_name': 'Wasserstein distance (normed)',
  'stattest_threshold': 0.1,
  'drift_score': 0.012792578226012617,
  'drift_detected': False,
  'current': {'small_distribution': {'x': [17.0,
     24.3,
     31.6,
     38.9,
     46.2,
     53.5,
     60.8,
     68.1,
     75.4,
     82.7,
     90.0],
    'y': [0.023186957147292454,
     0.024322830292979544,
     0.026549141658526253,
     0.02623109717773384,
     0.017325851715547083,
     0.010677207569458648,
     0.006118570011434459,
     0.0016659472803410623,
     0.0005906540357572868,
     0.00031804448079238525]}},
  'reference': {'small_distribution': {'x': [17.0,
     24.3,
     31.6,
     38.9,
     46.2,
     53.5,
     60.8,
     68.1,
     75.4,
     82.7,
     90.0],
    'y': [0.021874944384932378,
     0.025067012606586867,
     0.026320363236916222,
     0.026805042634747485,
     0.017312142241286285,
     0.010996163838297325,
     0.006043346241708861,
 

We see a reduction in our results as expected. Now lets try some of the various libraries to see how well they work at detecting this kind of drift: 

In [58]:
from frouros.detectors.concept_drift import DDM, DDMConfig
from frouros.detectors.data_drift import KSTest, MMD, JS
from frouros.callbacks.batch import PermutationTestOnBatchData

# # Set significance level for hypothesis testing
alpha = 0.001
# Define and fit detector
detector = KSTest()

drift_column_index = [X_test.columns.get_loc(c) for c in drift_columns]


detector.fit(X=X_train.to_numpy()[:, drift_column_index[0]])
# Apply detector to the selected feature of X_test
result = detector.compare(X=X_test_drifted.to_numpy()[:, drift_column_index[0]])

# Check if drift is taking place
result[0].p_value < alpha

True

In [59]:
# Create a KS detector for each of the numerical columns and use those to detect drift in the test set.
ks_detectors = {}
for i in range(len(num_columns)):
    ks_detectors[num_columns[i]] = KSTest()
    ks_detectors[num_columns[i]].fit(X=X_train.to_numpy()[:, i])
    

In [60]:
detector_mmd = MMD(
    callbacks=[
        PermutationTestOnBatchData(
            num_permutations=10,
            random_state=42,
            num_jobs=-1,
            name="mmd",
            verbose=False
        )
    ]
)

_ = detector_mmd.fit(X=X_train[drift_columns].to_numpy())

alpha = 0.05

p_value_test = detector_mmd.compare(X=X_test[drift_columns].to_numpy())
p_value_drift = detector_mmd.compare(X=X_test_drifted[drift_columns].to_numpy())

KeyboardInterrupt: 

In [None]:
detector_js = JS()
detector_js.fit(X=X_train[cat_columns[0]].to_numpy())

{}

In [42]:
detector_js.compare(X=X_test_drifted[cat_columns[0]].to_numpy())

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [40]:
detector_js.compare(X=X_test[drift_columns[0]].to_numpy())

(DistanceResult(distance=0.012315514836215725), {})

In [33]:
detector_mmd.compare(X=X_test[drift_columns].to_numpy())

(DistanceResult(distance=0.0001380477288249712),
 {'mmd': {'observed_statistic': 0.0001380477288249712,
   'permuted_statistics': [0.0001382741105307116,
    0.0001389146841483667,
    0.00013812228686629812,
    0.0001381366784089614,
    0.0001386379574237703,
    0.0001364350327053839,
    0.0001386731864867934,
    0.00013823247529990259,
    0.00013768108466246637,
    0.0001384197855559166],
   'p_value': 0.8}})

In [16]:
p_value_test

(DistanceResult(distance=0.0001380477288249712),
 {'mmd': {'observed_statistic': 0.00015340048651017605,
   'permuted_statistics': [0.00014303627244862175,
    0.00014368823742172385,
    0.00014343325418966866,
    0.00014378314855220916,
    0.00014364316474629744,
    0.00014213944944672753,
    0.00014373311528768104,
    0.0001434274212673114,
    0.0001428448147722679,
    0.00014341464180553666],
   'p_value': 0.0}})

In [17]:
p_value_drift

(DistanceResult(distance=0.00015340048651017605),
 {'mmd': {'observed_statistic': 0.00015340048651017605,
   'permuted_statistics': [0.00014303627244862175,
    0.00014368823742172385,
    0.00014343325418966866,
    0.00014378314855220916,
    0.00014364316474629744,
    0.00014213944944672753,
    0.00014373311528768104,
    0.0001434274212673114,
    0.0001428448147722679,
    0.00014341464180553666],
   'p_value': 0.0}})

In [None]:
import pickle

ADULT_MODEL_PATH = "models/adult_model.pkl"
KS_DETECTORS_PATH = "models/ks_detectors.pkl"
MMD_DETECTOR_PATH = "models/mmd_detector.pkl"

In [61]:
X_test

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
36919,27,Private,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,Nicaragua
17947,43,Local-gov,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,0,45,United-States
3173,49,Local-gov,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,0,0,40,United-States
35365,39,Local-gov,Assoc-voc,11,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
13386,32,Private,Some-college,10,Separated,Exec-managerial,Unmarried,Black,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29647,32,Private,Assoc-acdm,12,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,62,United-States
38393,64,Without-pay,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,60,United-States
5522,22,Private,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,40,United-States
43839,51,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States


In [34]:
X_test.workclass.unique()

array(['Private', 'Local-gov', 'Self-emp-not-inc', 'Federal-gov',
       'State-gov', 'Self-emp-inc', 'Without-pay'], dtype=object)

In [62]:
X_test.education.unique()

array(['Some-college', 'Prof-school', 'Bachelors', 'Assoc-voc',
       'Doctorate', 'HS-grad', 'Masters', '10th', 'Assoc-acdm', '9th',
       '5th-6th', '11th', '7th-8th', '12th', '1st-4th', 'Preschool'],
      dtype=object)

In [77]:
X_test["capital-loss"].mean()

84.55776672194582

In [78]:
X_test["capital-loss"].max()

3770

In [81]:
X_test["hours-per-week"].min()

2

In [68]:
X_test["marital-status"].unique().tolist()

['Never-married',
 'Married-civ-spouse',
 'Divorced',
 'Separated',
 'Widowed',
 'Married-spouse-absent',
 'Married-AF-spouse']

In [69]:
X_test.occupation.unique().tolist()

['Adm-clerical',
 'Prof-specialty',
 'Protective-serv',
 'Exec-managerial',
 'Craft-repair',
 'Sales',
 'Other-service',
 'Handlers-cleaners',
 'Transport-moving',
 'Machine-op-inspct',
 'Tech-support',
 'Farming-fishing',
 'Priv-house-serv',
 'Armed-Forces']

In [73]:
X_test.sex.unique().tolist()

['Female', 'Male']

In [82]:
X_test["native-country"].unique().tolist()

['Nicaragua',
 'United-States',
 'Jamaica',
 'Italy',
 'Germany',
 'Cuba',
 'Scotland',
 'Canada',
 'Philippines',
 'Mexico',
 'India',
 'Japan',
 'Cambodia',
 'Dominican-Republic',
 'Iran',
 'Ireland',
 'Vietnam',
 'El-Salvador',
 'Puerto-Rico',
 'Hungary',
 'Guatemala',
 'China',
 'Outlying-US(Guam-USVI-etc)',
 'Laos',
 'Peru',
 'France',
 'England',
 'Ecuador',
 'Columbia',
 'Thailand',
 'Poland',
 'Portugal',
 'Taiwan',
 'Greece',
 'South',
 'Haiti',
 'Honduras',
 'Yugoslavia',
 'Hong',
 'Trinadad&Tobago']

In [86]:
X_test_drifted.to_csv("data/x_adult_test_drifted.csv", index=False)