# Performative Prediciton: A Case Study in Strategic Classification

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from utils.optimization import logistic_regression

import whynot as wn
import whynot.gym as gym

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1) Set up Repeated Classification Environment

We perform our experiments on the *Give me a credit* dataset available at https://www.kaggle.com/c/GiveMeSomeCredit. The dataset contains features of individuals and an indicator variable on whether they defaulted on a loan or not.

In [4]:
# Make the Credit environment and set random seed.
np.random.seed(0)
env = gym.make('Credit-v0')
env.seed(1)

[1]

In [5]:
env.reset()[0]

array([[-0.0227904 ,  0.81170814, -0.10910125, ..., -0.05447654,
        -0.74176455,  1.        ],
       [-0.02210369,  0.46512671, -0.10910125, ...,  0.23555616,
        -0.74176455,  1.        ],
       [-0.02173707, -0.71325019,  0.17667678, ..., -0.05447654,
        -0.74176455,  1.        ],
       ...,
       [-0.02268837,  0.04922898, -0.10910125, ..., -0.05447654,
        -0.74176455,  1.        ],
       [-0.02225384, -0.78256647, -0.10910125, ..., -0.05447654,
         0.9998099 ,  1.        ],
       [-0.02228875, -1.05983163,  0.46245481, ..., -0.05447654,
         0.12902268,  1.        ]])

## 2) Simulation

### Initial ERM Classifier

We train a logistic regression classifier on the given set of features and labels. 
This corresponds to the classical supervised learning procedure.

In [14]:
# fit logistic regression model we treat as the truth
base_features, base_labels = env.initial_state.values()
num_agents, num_features = base_features.shape

lam = 1.0 / num_agents
theta_true, loss_list, smoothness = logistic_regression(base_features, base_labels, lam, 'Exact')

print('Accuracy: ', ((base_features.dot(theta_true) > 0)  == base_labels).mean())
print('Loss: ', loss_list[-1])
print('Condition Number: ', lam / (smoothness + lam))
print('Norm: ', np.linalg.norm(theta_true))
theta_true

Accuracy:  0.7208149479762488
Loss:  0.6100457881235787
Condition Number:  1.1674064432758535e-05
Norm:  2.449254451036605


array([-0.00685752, -0.38023149,  1.74532448, -0.04753519, -0.44431336,
        0.01090135,  1.55568933,  0.09787015,  0.1344509 ,  0.09312879,
       -0.38958459])

In [13]:
lam

5.447513210219535e-05

### Repeated ERM

We simulate the effect of performativity. Therefore we perform `num_iters` rounds. In each round the following three steps are performed: i) we train an ERM classifier on the current set of features, ii) the classifier is deployed and iii) the individuals react strategically to the deployed calssifier, inducing a new set of fetaures for the next round

a) configure experiment

In [17]:
# problems parameters
num_iters    = 25
eps_list = [1]#, 1]#, 50] #100]
num_eps  = len(eps_list)

# define which method to run: 'Exact' or 'RGD'
method = 'Exact'  

b) setup summary statistics

In [18]:
theta_list         = [[np.copy(theta_true)] for _ in range(num_eps)]
theta_gaps         = [[] for _ in range(num_eps)]
ll_list            = [[] for _ in range(num_eps)]
acc_list_start     = [[] for _ in range(num_eps)]
acc_list_end       = [[] for _ in range(num_eps)]
lp_list_start      = [[] for _ in range(num_eps)]
lp_list_end        = [[] for _ in range(num_eps)]

c) simulate experiment

In [None]:
for c, eps in enumerate(eps_list):
    
    # initial theta
    theta = np.copy(theta_true)

    print('Running epsilon =  {}\n'.format(eps))
    
    env.config.epsilon = eps
    env.config.l2_penalty = lam
    env.reset()
    
    for t in range(num_iters):
        
        (X_strat, Y), loss_start, _, _ = env.step(theta)
        
        # evaluate initial loss on the current distribution
        # performative loss value of previous theta
        acc = ((X_strat.dot(theta) > 0) == Y).mean()
        
        acc_list_start[c].append(acc)
        lp_list_start[c].append(loss_start)
        
        # learn on induced distribution
        theta_init = None if method == 'Exact' else np.copy(theta)
        
        theta_new, ll, _ = logistic_regression(X_strat, Y, lam, method, tol=1e-7, 
                                                                 theta_init=theta_init)
        
        print(t)
        print()
        print(theta_list)
        # keep track of statistics
        ll_list[c].append(ll)
        theta_list[c].append(np.copy(theta_new))

        # evaluate final loss on the current distribution
        loss_end = wn.credit.strategic_logistic_loss(env.config, X_strat, Y, theta_new)
        acc = ((X_strat.dot(theta_new) > 0) == Y).mean()
        
        lp_list_end[c].append(loss_end)        
        acc_list_end[c].append(acc)
        
        theta = np.copy(theta_new)

Running epsilon =  1

0

[[array([-0.00685752, -0.38023149,  1.74532448, -0.04753519, -0.44431336,
        0.01090135,  1.55568933,  0.09787015,  0.1344509 ,  0.09312879,
       -0.38958459])]]
1

[[array([-0.00685752, -0.38023149,  1.74532448, -0.04753519, -0.44431336,
        0.01090135,  1.55568933,  0.09787015,  0.1344509 ,  0.09312879,
       -0.38958459]), array([-0.00685855, -0.38023149,  1.7453067 , -0.04753594, -0.44432084,
        0.01089886,  1.55560498,  0.09787351,  0.13455141,  0.09312899,
       -0.37984058])]]
2

[[array([-0.00685752, -0.38023149,  1.74532448, -0.04753519, -0.44431336,
        0.01090135,  1.55568933,  0.09787015,  0.1344509 ,  0.09312879,
       -0.38958459]), array([-0.00685855, -0.38023149,  1.7453067 , -0.04753594, -0.44432084,
        0.01089886,  1.55560498,  0.09787351,  0.13455141,  0.09312899,
       -0.37984058]), array([-0.00685948, -0.38023152,  1.74529054, -0.04753663, -0.44432768,
        0.01089658,  1.55552742,  0.09787655,  0.13464267, 

## 3) Visualization of Results

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib
import matplotlib.ticker as mtick
from matplotlib.ticker import FormatStrFormatter

matplotlib.rcParams['mathtext.fontset'] = 'cm'
matplotlib.rcParams['font.family'] = 'Times New Roman'

%matplotlib inline

 ### Perfromative Risk
 
 We visualize the perfromative risk during the repeated risk minimization procedure. We illustrate the risk at the beginning and at the end of each round, correcting the two values with a blue line and indicate the distribution shift with a dashed green line.

In [None]:
# visualize loss trajectory

for c in range(num_eps):
    plt.figure(figsize=(15,6))
    offset = 0.8
    plt.title('Perfromative Risk during {}, eps={}'.format(method,eps_list[c]))
    for i in range(2,num_iters):
        plt.plot([i,i+offset],[lp_list_start[c][i],lp_list_end[c][i]],'b*-')
        if i<num_iters-1:
            plt.plot([i+offset, i+1],[lp_list_end[c][i],lp_list_start[c][i+1]],'g--')

    plt.xlabel('Iteration',fontsize = 18)
    plt.ylabel('Loss',fontsize = 18) 
    plt.tick_params(labelsize=18)
    plt.yscale('log')

In [None]:
# visualize accuracy trajectory

for c in range(num_eps):
    fig = plt.figure(figsize=(15,6))
    ax = fig.gca()
    offset = 0.8
    
    for i in range(1,num_iters):
        # gain of RRM step
        plt.plot([i,i+offset],[acc_list_start[c][i],acc_list_end[c][i]],'b*-')
        if i<num_iters-1:
            plt.plot([i+offset, i+1],[acc_list_end[c][i],acc_list_start[c][i+1]],'g:')

    plt.xlabel('Iteration', fontsize = 18)
    plt.tick_params(labelsize=18)
    plt.title("Accuracy during {}, eps='{}'".format(method, eps_list[c]), fontsize = 18)