In [35]:
import itertools
import random
import warnings

import numpy as np
import pandas as pd
import sklearn
import sklearn.base
from sklearn import svm, ensemble
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# Silence warnings before importing nonconformist, as the original cell did.
warnings.filterwarnings('ignore')

from nonconformist.icp import IcpClassifier
from nonconformist.nc import ClassifierNc, MarginErrFunc, ClassifierAdapter

In [36]:
# Load the credit dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/german_credit.csv')

In [48]:
# Columns of `df` used as model inputs (selected in format_data).
# NOTE(review): 'employement_since' and 'debters' look misspelled but presumably
# mirror the CSV headers exactly — do not change them without renaming the data columns.
features = [
'status_checking_account',
'duration_in_month',
'credit_history',
'purpose',
'savings',
'employement_since',
'installment_rate',
'debters',
'resident_since',
'property',
'age',
'other_installments',
'housing',
'num_credits',
'job',
'num_liable',
'telephone',
'foreign_worker'
]
# Column holding the protected attribute; format_data returns it as z_tr / z_test.
protected_attribute = "gender"
# Candidate target columns; the cells below use "is_good_loan".
task_types =  ["is_good_loan", "is_high_credit"]
# Model family labels; not referenced in the visible cells — presumably used by other experiment code.
model_types = ["logistic", "gbm", "nn", "svm", "tree"]
# Intended split sizes; not referenced in the visible cells (format_data hard-codes test_size=0.2).
n_train = 800
n_test = 200

In [49]:
def format_data(task):
    """Split the global ``df`` 80/20 and return the pieces as NumPy arrays.

    Parameters
    ----------
    task : str
        Name of the column in ``df`` to use as the target.

    Returns
    -------
    dict
        Keys 'X_tr'/'X_test' (the ``features`` columns), 'y_tr'/'y_test'
        (the ``task`` column) and 'z_tr'/'z_test' (the ``protected_attribute``
        column), each converted with ``to_numpy()``.
    """
    # Fixed random_state keeps the split identical across calls and tasks.
    train_part, test_part = sklearn.model_selection.train_test_split(
        df, test_size=0.2, random_state=0
    )

    column_groups = {'X': features, 'y': task, 'z': protected_attribute}

    formatted = {}
    for prefix, columns in column_groups.items():
        formatted[prefix + '_tr'] = train_part[columns].to_numpy()
        formatted[prefix + '_test'] = test_part[columns].to_numpy()
    return formatted

In [50]:
def get_partition(data, index=None, n_partitions=2):
    """Return one contiguous block of the (deterministically) shuffled training set.

    The training arrays are shuffled together with a fixed seed, cut into
    ``n_partitions`` blocks of equal length (remainder rows are dropped) and
    the block numbered ``index`` is returned. Test arrays are passed through
    unchanged, and the input dict and its arrays are not modified.

    Parameters
    ----------
    data : dict
        Must contain 'X_tr' (2-D), 'y_tr' and 'z_tr' (1-D, same number of rows
        as 'X_tr'), plus 'X_test', 'y_test' and 'z_test'.
    index : int or None, default None
        Which block to return, in ``range(n_partitions)``. ``None`` draws the
        block from ``random.Random(0)``, so the choice is the same on every call.
    n_partitions : int, default 2
        Number of equal-length blocks the shuffled training set is cut into.

    Returns
    -------
    dict
        Same keys as ``data``; the '*_tr' entries hold only the selected block.
        Because features, labels and protected attribute are stacked into one
        array for shuffling, the returned '*_tr' arrays share a single dtype.

    Raises
    ------
    ValueError
        If ``n_partitions`` is smaller than 1 or ``index`` is outside
        ``range(n_partitions)``.
    """
    if n_partitions < 1:
        raise ValueError(f"n_partitions must be >= 1, got {n_partitions}")

    # Compare against None explicitly: `if not index` would treat a requested
    # index of 0 as "not given" and silently replace it with the random draw.
    if index is None:
        index = random.Random(0).randint(0, n_partitions - 1)
    elif not 0 <= index < n_partitions:
        raise ValueError(
            f"index must be in range({n_partitions}), got {index}"
        )

    X_train, y_train, z_train = data['X_tr'], data['y_tr'], data['z_tr']

    # Stack features, label and protected attribute column-wise so a single
    # shuffle keeps each row's X / y / z aligned.
    stacked = np.concatenate(
        (X_train, np.expand_dims(y_train, axis=1), np.expand_dims(z_train, axis=1)),
        axis=1,
    )
    np.random.RandomState(seed=0).shuffle(stacked)
    X_train, y_train, z_train = stacked[:, :-2], stacked[:, -2], stacked[:, -1]

    block_length = len(y_train) // n_partitions
    start = block_length * index
    end = start + block_length

    return {'X_tr': X_train[start:end], 'X_test': data['X_test'],
            'y_tr': y_train[start:end], 'y_test': data['y_test'],
            'z_tr': z_train[start:end], 'z_test': data['z_test']}


In [51]:
# Build the train/test arrays for the "is_good_loan" task, then keep only the
# default training partition chosen by get_partition.
data = get_partition(format_data("is_good_loan"))

In [41]:
# Standardise the features, fit a logistic regression on top, and wrap the
# whole pipeline in CalibratedClassifierCV (default settings).
base_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0),
)
model = CalibratedClassifierCV(base_clf)
model.fit(data['X_tr'], data['y_tr'])

In [42]:
# Keep only probability column 1 for every test row, as a plain list.
scores = list(model.predict_proba(data['X_test'])[:, 1])

In [43]:
# Inductive conformal prediction needs calibration examples that the underlying
# model was NOT trained on. The `data` dict built by format_data/get_partition
# has no 'X_cal' / 'y_cal' entries (looking them up raises KeyError), so hold
# out a quarter of the training partition for calibration here.
X_fit, X_cal, y_fit, y_cal = train_test_split(
    data["X_tr"], data["y_tr"], test_size=0.25, random_state=0
)

# Wrap an unfitted clone so that fitting the ICP does not refit `model` in place.
icp = IcpClassifier(
    ClassifierNc(ClassifierAdapter(sklearn.base.clone(model)), MarginErrFunc())
)
icp.fit(X_fit, y_fit)
icp.calibrate(X_cal, y_cal)

# Passing None as the significance level asks for per-class p-values
# (per the nonconformist IcpClassifier.predict contract — verify against the installed version).
pvals = icp.predict(data["X_test"], None)


In [62]:
# Shallow copy: assuming `data` is still the dict returned by get_partition,
# `partition` is a new dict whose values are the same array objects as in `data`.
partition = data.copy()

In [53]:
# 5-fold splitter without shuffling (KFold comes from sklearn.model_selection).
kf = KFold(n_splits=5, shuffle=False)

In [77]:
# Grab only the first fold: its number and its train/test index arrays.
k, (train, test) = next(enumerate(kf.split(data["X_tr"])))

In [80]:
# Feature rows selected by the first fold's `train` indices.
data["X_tr"][train]

array([[ 2.        ,  1.25194728,  2.        , ...,  2.3337012 ,
         0.        ,  0.        ],
       [ 1.        , -0.24073677,  1.        , ..., -0.42807537,
         1.        ,  0.        ],
       [ 0.        , -0.73829812,  1.        , ..., -0.42807537,
         1.        ,  0.        ],
       ...,
       [ 1.        ,  2.24706998,  0.        , ..., -0.42807537,
         0.        ,  0.        ],
       [ 0.        , -0.24073677,  1.        , ..., -0.42807537,
         1.        ,  0.        ],
       [ 2.        , -1.40171325,  0.        , ...,  2.3337012 ,
         1.        ,  0.        ]])

In [81]:
# Labels selected by the first fold's `train` indices.
data["y_tr"][train]

array([0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       1., 1., 0., 0., 0.

In [66]:
# NOTE(review): `a` is not defined above this cell in document order; the recorded
# output shows a 2x3 array, so it was presumably a nested list from a removed cell.
np.array(a)

array([[1, 2, 3],
       [2, 3, 4]])

In [68]:
# Column-wise mean (axis=0); relies on the same out-of-order `a` as the cell above.
np.mean(a, axis=0)

array([1.5, 2.5, 3.5])

In [69]:
# Scratch: rebind `a` and `b` to fresh empty dicts (filled by the unpacking cell below).
a = {}
b = {}

In [70]:
def myfun():
    """Return the fixed pair ``(1, 2)``."""
    pair = (1, 2)
    return pair

In [71]:
# Tuple unpacking straight into dict items: a["a"] gets 1, b["b"] gets 2.
a["a"], b["b"] = myfun()

In [73]:
# Show `b` after the unpacking assignment.
b

{'b': 2}

In [61]:
# Number of training rows kept in the current partition (recorded output: 400).
len(data["X_tr"])

400

In [33]:
# NOTE(review): this cell is recorded as failing with
#   TypeError: KFold.__init__() takes from 1 to 2 positional arguments but 3 were given
# presumably because the cross-validation helper calls KFold with an older
# positional signature — TODO confirm against the installed library versions.
# NOTE(review): ClassIcpCvHelper, cross_val_score, class_mean_errors and class_avg_c
# are not imported anywhere in the visible notebook (presumably they live in
# nonconformist.evaluation), and `data` is a plain dict in the cells above, so
# `data.data` / `data.target` would not resolve against it — TODO confirm which
# dataset object was intended.
# This cell also rebinds `icp` and `scores` from earlier cells.
icp = IcpClassifier(ClassifierNc(ClassifierAdapter(LogisticRegression()),
                                 MarginErrFunc()))

icp_cv = ClassIcpCvHelper(icp)

scores = cross_val_score(icp_cv,
                         data.data,
                         data.target,
                         iterations=5,
                         folds=5,
                         scoring_funcs=[class_mean_errors, class_avg_c],
                         significance_levels=[0.05, 0.1, 0.2])



TypeError: KFold.__init__() takes from 1 to 2 positional arguments but 3 were given

In [106]:
# Scratch 3x4 integer array used by the indexing experiments below.
a = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, 4],
    [7, 8, 9, 4],
])

In [107]:
# Display the scratch array.
a

array([[1, 2, 3, 4],
       [4, 5, 6, 4],
       [7, 8, 9, 4]])

In [85]:
# Single element at row 1, column 1 (zero-based).
a[1,1]

5

In [88]:
# All rows, columns 0 and 2 (a list of column positions selects several columns at once).
a[:, [0,2]]

array([[1, 3],
       [4, 6],
       [7, 9]])

In [114]:
# Scratch 1-D array holding the integers 1 through 10.
a = np.arange(1, 11)

In [101]:
# Shuffle `a` in place with a fixed-seed generator so the printed order is
# reproducible across kernel restarts (same RandomState(seed=0) pattern that
# get_partition uses). The unseeded global np.random.shuffle gave a different
# order on every run.
np.random.RandomState(seed=0).shuffle(a)
print(a)

[ 1  8  5  2  9  3  6  7 10  4]


In [113]:
# Scratch check: np.arange(4) yields the integers 0..3.
np.arange(4)

array([0, 1, 2, 3])

In [115]:
# int(10 * (2/3)) == 6, so this keeps the first six elements; the 10 is
# hard-coded rather than taken from len(a).
a[:int(10*(2/3))]

array([1, 2, 3, 4, 5, 6])

In [None]:
# Unexecuted cell: would display the current value of `a`.
a