In [155]:
import numpy as np

In [156]:
# Seed NumPy's global RNG first so every later np.random.* draw is reproducible.
np.random.seed(22)

# Number of synthetic rows generated for each dataset.
n_samples = 10_000

def group_membership(group, category):
    """Return a boolean mask marking which entries of `group` equal `category`.

    Parameters
    ----------
    group : np.ndarray
        Array of per-sample category codes; one mask entry is produced per
        element along its first axis.
    category :
        The value to match against the entries of `group`.

    Returns
    -------
    np.ndarray
        Boolean array of length ``group.shape[0]``; True where the entry matches.
    """
    mask = np.full(group.shape[0], False)
    # Flip on the positions where the comparison holds.
    mask[np.nonzero(group == category)] = True
    return mask

# Non-group features: 20 columns drawn uniformly from [0, 1).
features = np.random.uniform(low=0, high=1, size=(n_samples, 20))

# Group features
# group_names[k] labels group_memberships[k] built below.
group_names = ["ALL", "A", "B", "C", "0", "1", "A0", "A1", "B0", "B1", "C0", "C1"]
alpha_names = ["A", "B", "C"]
bin_names = ["0", "1"]
# NOTE: the draw order (features, alphas, bins) is kept as-is so the seeded
# random stream, and therefore every downstream number, stays the same.
alphas = np.random.choice([0, 1, 2], size=n_samples, p=[.5, .3, .2])
bins = np.random.choice([0, 1], size=n_samples, p=[.6, .4])

# One-hot encode the group codes. The width comes from the number of declared
# categories rather than from `codes.max() + 1`, so the column layout does not
# depend on which categories happen to be sampled.
alpha_onehot = np.eye(len(alpha_names))[alphas]
bin_onehot = np.eye(len(bin_names))[bins]

group_features = np.concatenate((alpha_onehot, bin_onehot), axis=1)

# Group memberships: each marginal mask is computed once and reused for the
# intersections (previously recomputed on every inner-loop iteration).
alpha_masks = [group_membership(alphas, code) for code in range(len(alpha_names))]
bin_masks = [group_membership(bins, code) for code in range(len(bin_names))]

# "ALL" is a boolean ndarray like every other mask (it used to be a Python list).
group_memberships = [np.ones(n_samples, dtype=bool)]

# Groups
group_memberships.extend(alpha_masks)
group_memberships.extend(bin_masks)

# Group intersections, alpha-major: A0, A1, B0, B1, C0, C1.
group_memberships.extend(
    alpha_mask & bin_mask for alpha_mask in alpha_masks for bin_mask in bin_masks
)
assert len(group_memberships) == len(group_names)

# Concatenate to get dataset
X = np.concatenate((features, group_features), axis=1)
n_features = X.shape[1]

# Column indices of the one-hot blocks inside X, derived from the actual
# shapes instead of being hard-coded ([20, 21, 22] and [23, 24] here).
_first_group_col = features.shape[1]
alpha_features = list(range(_first_group_col, _first_group_col + alpha_onehot.shape[1]))
bin_features = list(range(alpha_features[-1] + 1, n_features))

In [157]:
# Generate weight vectors for each group
# One weight vector per alpha group, drawn from a symmetric Dirichlet with
# concentration 1/100 on every feature. The three draws happen in the same
# order as before so the seeded random stream is unchanged.
alpha_weights = [
    np.random.dirichlet(np.ones(n_features) * (1/100)) for _ in range(3)
]
bin_weights = []  # left empty: bin-specific weights are currently disabled

#for i in range(2):
#    bin_weights.append(np.random.dirichlet(np.ones(n_features) * (1/100)))

# Generate alpha labels
# Each row is scored with the weight vector of its own alpha group;
# group_memberships[1:4] are the masks for A, B and C in that order.
y_alpha = np.zeros(n_samples)
for mask, weights in zip(group_memberships[1:4], alpha_weights):
    y_alpha[mask] = X[mask] @ weights

# Generate bin labels
#y_bin = np.zeros(n_samples)
#y_bin[group_memberships[4]] = X[group_memberships[4]] @ bin_weights[0]
#y_bin[group_memberships[5]] = X[group_memberships[5]] @ bin_weights[1]

# Generate final labels
# TODO: other aggregation schemes
# Threshold into a NEW array. The previous `y = y_alpha` aliased the score
# array, so the in-place thresholding silently overwrote the raw scores.
# y keeps the same float 0.0 / 1.0 values as before.
y = (y_alpha > 0.5).astype(float)

In [158]:
# Train some models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Row masks used in this cell (positions follow group_names).
mask_A = group_memberships[1]    # alpha group "A"
mask_0 = group_memberships[4]    # bin group "0"
mask_A0 = group_memberships[6]   # intersection "A0"

# One L2-penalised logistic regression per group, fitted on that group's rows only.
modelA = LogisticRegression(penalty='l2').fit(X[mask_A], y[mask_A])
model0 = LogisticRegression(penalty='l2').fit(X[mask_0], y[mask_0])

# Accuracy of each model on the same rows it was trained on.
predA = modelA.predict(X[mask_A])
pred0 = model0.predict(X[mask_0])
for truth, guess in ((y[mask_A], predA), (y[mask_0], pred0)):
    print(accuracy_score(truth, guess))

# How often the two models agree with each other on the A0 rows.
predA = modelA.predict(X[mask_A0])
pred0 = model0.predict(X[mask_A0])
print(accuracy_score(predA, pred0))

0.9938843953442493
0.8165866578381062
0.8418467583497053


In [159]:
# Row masks used in this cell (positions follow group_names).
mask_A = group_memberships[1]    # alpha group "A"
mask_0 = group_memberships[4]    # bin group "0"
mask_A0 = group_memberships[6]   # intersection "A0"

# Fit order (treeA, then tree0) is kept: no random_state is passed, so the
# trees may consume draws from the shared NumPy RNG while fitting.
treeA = DecisionTreeClassifier().fit(X[mask_A], y[mask_A])
tree0 = DecisionTreeClassifier().fit(X[mask_0], y[mask_0])

# Accuracy of each tree on the rows it was trained on.
predA = treeA.predict(X[mask_A])
pred0 = tree0.predict(X[mask_0])
for truth, guess in ((y[mask_A], predA), (y[mask_0], pred0)):
    print(accuracy_score(truth, guess))

# Agreement between the two trees on the A0 rows.
agreement_A0 = accuracy_score(treeA.predict(X[mask_A0]), tree0.predict(X[mask_A0]))
print(agreement_A0)

1.0
1.0
1.0


In [160]:
# Ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Three models trained on the full dataset (all groups pooled).
tree = DecisionTreeClassifier()
gbc = GradientBoostingClassifier()
forest = RandomForestClassifier(n_estimators=30)

# Fit in the original order (tree, forest, gbc): no random_state is passed,
# so the estimators share NumPy's global RNG and the order of fitting
# determines the draws each one (and every later cell) sees.
for estimator in (tree, forest, gbc):
    estimator.fit(X, y)


# Per-group accuracy of the gradient-boosting model, one line per entry of
# group_memberships (same order as group_names). Iterating over the masks
# themselves replaces the hard-coded `range(12)`.
for mask in group_memberships:
    print(accuracy_score(y[mask], gbc.predict(X[mask])))

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [163]:
# Draw a fresh sample using the same recipe as the first dataset and label it
# with the existing `alpha_weights`. This deliberately overwrites X, y and
# group_memberships; the models fitted earlier are then scored on the new data.

# Non-group features: 20 columns drawn uniformly from [0, 1).
features = np.random.uniform(low=0, high=1, size=(n_samples, 20))

# Group features
group_names = ["ALL", "A", "B", "C", "0", "1", "A0", "A1", "B0", "B1", "C0", "C1"]
alpha_names = ["A", "B", "C"]
bin_names = ["0", "1"]
# NOTE: draw order (features, alphas, bins) is kept so the random stream is unchanged.
alphas = np.random.choice([0, 1, 2], size=n_samples, p=[.5, .3, .2])
bins = np.random.choice([0, 1], size=n_samples, p=[.6, .4])

# One-hot width is taken from the number of declared categories, not from
# `codes.max() + 1`, so the layout of X always matches the first dataset.
alpha_onehot = np.eye(len(alpha_names))[alphas]
bin_onehot = np.eye(len(bin_names))[bins]

group_features = np.concatenate((alpha_onehot, bin_onehot), axis=1)

X = np.concatenate((features, group_features), axis=1)

# Group memberships: marginal masks are computed once and reused for the
# intersections instead of being recomputed inside the nested loop.
alpha_masks = [group_membership(alphas, code) for code in range(len(alpha_names))]
bin_masks = [group_membership(bins, code) for code in range(len(bin_names))]

# "ALL" is a boolean ndarray like every other mask (it used to be a Python list).
group_memberships = [np.ones(n_samples, dtype=bool)]

# Groups
group_memberships.extend(alpha_masks)
group_memberships.extend(bin_masks)

# Group intersections, alpha-major: A0, A1, B0, B1, C0, C1.
group_memberships.extend(
    alpha_mask & bin_mask for alpha_mask in alpha_masks for bin_mask in bin_masks
)
assert len(group_memberships) == len(group_names)

# Generate alpha labels
# group_memberships[1:4] are the A, B, C masks; each row is scored with the
# weight vector of its own alpha group.
y_alpha = np.zeros(n_samples)
for mask, weights in zip(group_memberships[1:4], alpha_weights):
    y_alpha[mask] = X[mask] @ weights

# Generate bin labels
#y_bin = np.zeros(n_samples)
#y_bin[group_memberships[4]] = X[group_memberships[4]] @ bin_weights[0]
#y_bin[group_memberships[5]] = X[group_memberships[5]] @ bin_weights[1]

# Generate final labels
# TODO: other aggregation schemes
# Threshold into a NEW array: `y = y_alpha` used to alias the scores, so the
# in-place thresholding destroyed them. y keeps float 0.0 / 1.0 values.
y = (y_alpha > 0.5).astype(float)

In [164]:
# Per-group accuracy of the pooled models on the current X / y: first the
# gradient-boosting model, then the single decision tree. One line is printed
# per entry of group_memberships, with a blank line after each model.
# Iterating over the masks replaces the hard-coded `range(12)` and the
# copy-pasted second loop; the printed output is unchanged.
for estimator in (gbc, tree):
    for mask in group_memberships:
        print(accuracy_score(y[mask], estimator.predict(X[mask])))
    print()

# Group-specific trees, each scored on its own group: "A" (index 1) and "0" (index 4).
for estimator, mask in ((treeA, group_memberships[1]), (tree0, group_memberships[4])):
    print(accuracy_score(y[mask], estimator.predict(X[mask])))

0.9998
1.0
0.9993192648059904
1.0
0.9998318479905834
0.999753269183321
1.0
1.0
0.9994305239179955
0.9991539763113367
1.0
1.0

0.9999
1.0
0.9996596324029953
1.0
1.0
0.999753269183321
1.0
1.0
1.0
0.9991539763113367
1.0
1.0

1.0
1.0
