In [1]:
from aif360.sklearn.datasets import fetch_german

pip install 'aif360[LawSchoolGPA]'


In [2]:
from numpy import mean
from numpy import std

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC

from matplotlib import pyplot


In [3]:
# load the dataset
def load_dataset():
    """Fetch the German credit data and prepare it for modelling.

    Returns:
        tuple: ``(X, y, cat_ix, num_ix)`` where ``X`` is the feature table
        exposed by ``fetch_german()``, ``y`` is the target after
        ``LabelEncoder`` has replaced its classes with integer codes,
        ``cat_ix`` holds the names of the ``category``-typed columns and
        ``num_ix`` holds the names of the numeric columns.
    """
    # fetch the dataset; features and target are exposed as attributes
    dataset = fetch_german()
    # split into inputs and outputs
    X, y = dataset.X, dataset.y
    # column names of the categorical features
    cat_ix = X.select_dtypes(include=['category']).columns
    # column names of the numeric features; 'number' matches every numeric
    # dtype (not only int64/float64), so no numeric column is silently left
    # out of the preprocessing that is built from these two index lists
    num_ix = X.select_dtypes(include=['number']).columns
    # label encode the target variable to have the classes 0 and 1
    y = LabelEncoder().fit_transform(y)
    return X, y, cat_ix, num_ix

In [8]:
# Load the features, the encoded target and the categorical / numeric column
# name lists that the cells below rely on.
X, y, cat_ix, num_ix = load_dataset()

  warn(


In [56]:
# calculate f2 score
def f2(y_true, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_true`` with ``beta=2``."""
    score = fbeta_score(y_true, y_pred, beta=2)
    return score

# evaluate a model
def evaluate_model(X, y, model, n_splits=10, n_repeats=3, random_state=1):
    """Cross-validate ``model`` on ``(X, y)`` and return its F2 scores.

    Args:
        X: features passed through to ``cross_val_score``.
        y: target labels passed through to ``cross_val_score``.
        model: estimator (or pipeline) to evaluate.
        n_splits: number of stratified folds per repetition (default 10).
        n_repeats: number of times the k-fold split is repeated (default 3).
        random_state: seed for the repeated splitter (default 1).

    Returns:
        The scores produced by ``cross_val_score`` for the splits generated
        by the repeated stratified k-fold splitter.
    """
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    # define the model evaluation metric
    metric = make_scorer(f2)
    # evaluate the model, using all available cores (n_jobs=-1)
    return cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)

In [57]:
# define the reference model: a baseline that predicts the constant class 1
# for every sample, giving the F2 score the real models have to beat
model = DummyClassifier(strategy='constant', constant=1)

In [58]:
# score the reference model with the shared evaluation routine
scores = evaluate_model(X, y, model)
# report the mean and standard deviation of the collected F2 scores
mean_f2, std_f2 = mean(scores), std(scores)
print('Mean F2: %.3f (%.3f)' % (mean_f2, std_f2))

Mean F2: 0.921 (0.000)


In [64]:
# define models to test
def get_models():
    """Build the candidate classifiers together with their short labels.

    Returns:
        tuple: ``(models, names)`` - two parallel lists, the estimator
        instances and the label used for each one when reporting results.
    """
    # (label, estimator) pairs, in the order they are evaluated
    candidates = [
        ('LR', LogisticRegression(solver='liblinear')),
        ('LDA', LinearDiscriminantAnalysis()),
        ('NB', GaussianNB()),
        ('GPC', GaussianProcessClassifier()),
        ('SVM', SVC(gamma='scale')),
    ]
    models = [estimator for _, estimator in candidates]
    names = [label for label, _ in candidates]
    return models, names

# Instantiate the candidate models and their labels once, for the evaluation loop.
models, names = get_models()

In [65]:
results = list()
# evaluate each model
for name, estimator in zip(names, models):
    # one hot encode categorical, normalize numerical; handle_unknown='ignore'
    # keeps a fold from failing when a category that is absent from its
    # training split shows up in its test split
    ct = ColumnTransformer([
        ('c', OneHotEncoder(handle_unknown='ignore'), cat_ix),
        ('n', MinMaxScaler(), num_ix),
    ])
    # wrap the model in a pipeline so preprocessing is re-fit inside each fold
    pipeline = Pipeline(steps=[('t', ct), ('m', estimator)])
    # evaluate the model and store results
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize performance
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>LR 0.862 (0.033)
>LDA 0.855 (0.032)
>NB 0.682 (0.148)
>GPC 0.910 (0.016)
>SVM 0.887 (0.025)


In [13]:
# Fit the one-hot encoder / min-max scaler on the whole table and rebind X to
# the transformed features.
ct = ColumnTransformer(
    transformers=[
        ('c', OneHotEncoder(), cat_ix),
        ('n', MinMaxScaler(), num_ix),
    ]
)
X = ct.fit_transform(X)

In [27]:
# Inspect the container type of X after it was rebound to the output of
# ct.fit_transform in the previous cell.
type(X)

numpy.ndarray

In [None]:
# Number of distinct levels the fitted one-hot encoder found per categorical column.
category_sizes = [len(levels) for levels in ct.named_transformers_['c'].categories_]
for size in category_sizes:
    print(size)
# Total number of levels across all categorical columns.
total_count = sum(category_sizes)
print(total_count)

In [91]:
# Reload the data: X was overwritten above with the output of ct.fit_transform,
# and the cells below need the original 'age' and 'sex' columns.
X, y, cat_ix, num_ix = load_dataset()

  warn(


In [92]:
# binarise the age column: 0 when age <= 25, 1 otherwise
# NOTE: this overwrites X['age'] in place, so the cell must run exactly once per load
X['age'] = 1 - X['age'].le(25).astype('int64')

In [93]:
# binarise the sex column: 0 for 'female', 1 for every other value
# NOTE: this overwrites X['sex'] in place, so the cell must run exactly once per load
X['sex'] = (X['sex'] != 'female').astype(int)

In [94]:
# Marginal probabilities of the two binarised attributes: the mean of a 0/1
# column is the share of ones, and the complement is the share of zeros.
prob_age_1 = X['age'].mean()  # P(age = 1)
prob_sex_1 = X['sex'].mean()  # P(sex = 1)
prob_age_0 = 1 - prob_age_1   # P(age = 0)
prob_sex_0 = 1 - prob_sex_1   # P(sex = 0)

# Report the values in a fixed order.
for label, value in (
    ("P(age = 1):", prob_age_1),
    ("P(age = 0):", prob_age_0),
    ("P(sex = 1):", prob_sex_1),
    ("P(sex = 0):", prob_sex_0),
):
    print(label, value)

P(age = 1): 0.81
P(age = 0): 0.18999999999999995
P(sex = 1): 0.69
P(sex = 0): 0.31000000000000005


In [96]:
# Conditional probabilities of the target given each attribute value: select
# the rows of y for one group with a boolean mask and take the mean of the
# 0/1 labels.
age_is_0 = X['age'].eq(0)
age_is_1 = X['age'].eq(1)
sex_is_0 = X['sex'].eq(0)
sex_is_1 = X['sex'].eq(1)

prob_y_given_age_0 = y[age_is_0].mean()  # P(y=1|age=0)
prob_y_given_age_1 = y[age_is_1].mean()  # P(y=1|age=1)
prob_y_given_sex_0 = y[sex_is_0].mean()  # P(y=1|sex=0)
prob_y_given_sex_1 = y[sex_is_1].mean()  # P(y=1|sex=1)

# Report the values in a fixed order.
for label, value in (
    ("P(y=1|age=0):", prob_y_given_age_0),
    ("P(y=1|age=1):", prob_y_given_age_1),
    ("P(y=1|sex=0):", prob_y_given_sex_0),
    ("P(y=1|sex=1):", prob_y_given_sex_1),
):
    print(label, value)

P(y=1|age=0): 0.5789473684210527
P(y=1|age=1): 0.7283950617283951
P(y=1|sex=0): 0.6483870967741936
P(y=1|sex=1): 0.7231884057971014
